Packages required for installation
# Installing Libraries
!pip install teradatasql
!pip install teradata
!pip install pyodbc
Requirement already satisfied: teradatasql in /anaconda/envs/azureml_py36/lib/python3.6/site-packages (17.10.0.2) Requirement already satisfied: pycryptodome in /anaconda/envs/azureml_py36/lib/python3.6/site-packages (from teradatasql) (3.11.0) Requirement already satisfied: teradata in /anaconda/envs/azureml_py36/lib/python3.6/site-packages (15.10.0.21) Requirement already satisfied: pyodbc in /anaconda/envs/azureml_py36/lib/python3.6/site-packages (4.0.0-unsupported)
# Importing Libraries
import teradatasql
import teradata
import pandas as pd
import getpass
import pyodbc
import matplotlib.pyplot as plt
import matplotlib
import seaborn as sns
import numpy as np
from numpy import mean
from numpy import std
import statsmodels.api as sm
import itertools
import uszipcode
from uszipcode import SearchEngine
from geopy.geocoders import Nominatim
from gcmap import GCMapper, Gradient
from statsmodels.tsa.seasonal import seasonal_decompose
from pylab import rcParams
from matplotlib.colors import Normalize, LinearSegmentedColormap, PowerNorm
import plotly.graph_objects as go
import geopandas as gpd
from scipy import stats
from scipy.special import boxcox1p
from scipy.stats import boxcox_normmax
from scipy.stats import skew
from scipy.stats.mstats import winsorize
from scipy.stats import zscore
Teradata Connection
# Connection to teradata with credentials
user = "704135"
password = getpass.getpass()
server = "edtdp.aa.com"
# Defining the connection
con = teradatasql.connect(host=server, user=user, password=password, logmech='LDAP')
Data Pull
# Data pull SQL statement
query = """ SELECT
t1.PARTY_ID, /*unique customer id*/
t1.LYLTY_ACCT_ID, /*loyalty program customer id*/
CASE WHEN t1.LYLTY_ACCT_ID IS NOT NULL THEN 1 ELSE 0 END AS loyalty_Ind,
t1.LYLTY_LEVEL_CD, /*loyalty program category - loyalty customers only*/
t3.MACRO_SEG_ID,
t3.INDVDL_AGE_QTY,
t3.GENDER_CD,
CASE WHEN t3.POSTAL_CD='' THEN null ELSE t3.POSTAL_CD END AS POSTAL_CD,
t3.BKGS_QTY,
t3.AADVAN_MBR_TTL_YEARS_QTY,
t3.AA_OPER_REVNUE_SEG_AMT,
t3.AA_OPER_REVNUE_SEG_QTY,
t3.ACRU_AEM_QTY,
t3.ACRU_FLIGHT_AEM_QTY,
t3.ACRU_FLIGHT_SEG_QTY,
t3.BARCLAYS_CARD_HOLDER_IND,
t3.BKGS_INTL_QTY,
COALESCE(CEILING(t3.BKGS_PAX_QTY_AVG),1) AS BKGS_PAX_QTY_AVG,
CEILING(t3.CHECK_BAGS_PER_TRIP_AVG) AS CHECK_BAGS_PER_TRIP_AVG,
t3.CITIBNK_PREMIM_CARD_HLDR_IND,
t3.FLOWN_SEG_QTY,
t3.LOUNGE_MBR_PRTCPNT_IND,
t3.TRIP_QTY,
CASE
WHEN t3.AA_OPER_REVNUE_SEG_AMT>= 2500 THEN 'High Value'
WHEN t3.AA_OPER_REVNUE_SEG_AMT < 2500 AND t3.BKGS_QTY>1 THEN 'Medium Value'
WHEN t3.AA_OPER_REVNUE_SEG_AMT = 0 OR t3.BKGS_QTY<=1 THEN 'Value'
ELSE 'No Valuation' END AS aa_value_code,
EXTRACT(MONTH FROM t1.TICKET_ISSUE_DT) AS booking_month,
t1.PNR_LOCTR_ID, /*PNR number*/
t1.PNR_CREATE_DT, /*PNR created date*/
t1.TICKET_NBR, /*ticket number*/
t1.TICKET_ISSUE_DT, /*ticket issued date*/
t1.SEG_DEP_DT, /*segment departure date*/
t1.SEG_DEP_DT-t1.TICKET_ISSUE_DT AS days_before_dept,
TD_DAY_OF_WEEK(t1.SEG_DEP_DT) AS day_of_week_dep,
EXTRACT(MONTH FROM t1.SEG_DEP_DT) AS dep_month,
t1.SEG_DEP_AIRPRT_IATA_CD,
t1.SEG_ARVL_AIRPRT_IATA_CD,
t1.MKT_N_DIRECTN_AIRPRT_PAIR_CD, /*pair of locations*/
CASE WHEN t1.DEP_CNTRY_CD IN ('US','CA','GB','MX') THEN t1.DEP_CNTRY_CD ELSE 'Other' END AS DEP_CNTRY_CD, /*dept country*/
CASE WHEN t1.ARVL_CNTRY_CD IN ('US','CA','GB','MX') THEN t1.ARVL_CNTRY_CD ELSE 'Other' END AS ARVL_CNTRY_CD, /*arriv country*/
CASE
WHEN t1.DEP_CNTRY_CD NOT IN 'US' THEN 'INTL'
WHEN t1.SEG_DEP_AIRPRT_IATA_CD IN ('AEX','BHM','CHA','CLE','DAY','FSD','JAN','MCI','MSN','TYS') THEN 'FOCUS'
WHEN t1.SEG_DEP_AIRPRT_IATA_CD IN ('CLT','DFW','EYW','FLL','MIA','PBI','PHL','PHX') THEN 'MAJOR HUB' ----> Major HUB
WHEN t1.SEG_DEP_AIRPRT_IATA_CD IN ('LAX','JFK','ORD') THEN 'MAJOR HUB MULTP' ---> Major HUB for multiple airlines
WHEN t1.SEG_DEP_AIRPRT_IATA_CD IN ('ATL','DEN','DTW','EWR','HOU','MSP','OAK','SBA','SEA','SFO','SJC','SJO','SLC') THEN 'OTH AIRL HUB' ---> Major HUB for another airline
ELSE 'SPOKE' END AS AIRPRT_IATA_GRP,
CASE
WHEN t1.DEP_CNTRY_CD NOT IN 'US' THEN 'INTL'
WHEN t1.SEG_DEP_AIRPRT_IATA_CD IN ('AEX','BHM','CHA','CLE','DAY','FSD','JAN','MCI','MSN','TYS') THEN 'FOCUS'
WHEN t1.SEG_DEP_AIRPRT_IATA_CD IN ('CLT','DFW','EYW','FLL','MIA','PBI','PHL','PHX') THEN t1.SEG_DEP_AIRPRT_IATA_CD ----> Major HUB
WHEN t1.SEG_DEP_AIRPRT_IATA_CD IN ('LAX','JFK','ORD') THEN t1.SEG_DEP_AIRPRT_IATA_CD ---> Major HUB for multiple airlines
WHEN t1.SEG_DEP_AIRPRT_IATA_CD IN ('ATL','DEN','DTW','EWR','HOU','MSP','OAK','SBA','SEA','SFO','SJC','SJO','SLC') THEN t1.SEG_DEP_AIRPRT_IATA_CD ---> Major HUB for another airline
ELSE 'SPOKE' END AS AIRPRT_IATA_TOP,
t1.TACTL_ENTITY, /*do we need this*/
t1.SEG_INTRVL_TYPE, /*segment interval type*/
t1.MILE_GREAT_CIRCLE_DISTANC_QTY,
s2.SEG_GEOG_AREA_CD,
s2.SEG_GEOG_AREA_DESC,
t1.CABIN_BOOKED, /*Y - main cabin, W - premium economy, C = business, F = first*/
t1.CABIN_FLOWN, /*Y - main cabin, W - premium economy, C = business, F = first*/
CASE WHEN t1.UPG_TYPE IS NULL THEN 'NONE' ELSE t1.UPG_TYPE END AS UPG_TYPE,
CASE WHEN t1.UPG_TYPE IS NOT NULL THEN 1 ELSE 0 END AS upgrade_ind,
t1.ACCT_FARE_CLASS_CD,
CASE
WHEN t1.ACCT_FARE_CLASS_CD IN ('F','A') THEN 'First Class'
WHEN t1.ACCT_FARE_CLASS_CD IN ('J','R','D','I') THEN 'Business Class'
WHEN t1.ACCT_FARE_CLASS_CD IN ('W','P') THEN 'Premium Economy'
WHEN t1.ACCT_FARE_CLASS_CD IN ('B') THEN 'Basic Economy' ELSE 'Economy' END AS ACCT_FARE_CLASS_TYPE,
t1.TKT_TYPE, /*ind awd ticket R revenue , A awd, convert or pmpute*/
t1.BX_IND,
CASE WHEN t2.BKG_BUSINES_LEISR_IND IS NULL THEN 'L' ELSE t2.BKG_BUSINES_LEISR_IND END AS BKG_BUSINES_LEISR_IND,
CASE WHEN t1.FLIGHT_REV + COALESCE(s2.ancll_rev_ttl,0) >0 THEN 1 ELSE 0 END AS Spend_Ind,
t1.FLIGHT_REV, /*customer can use awr miles, customer segment level, prorated*/
CASE WHEN s2.ancll_rev_ttl >= 1 THEN 1 ELSE 0 END AS ancillary_spend_ind,
COALESCE(s2.ancll_rev_ttl,0) AS ancll_rev_ttl,
COALESCE(s2.ancll_qty_ttl,0) AS ancll_qty_ttl,
COALESCE(s2.rev_pref_seats,0) AS rev_pref_seats,
COALESCE(s2.qty_pref_seats,0) AS qty_pref_seats,
COALESCE(s2.rev_bag,0) AS rev_bag,
COALESCE(s2.qty_bag,0) AS qty_bag,
COALESCE(s2.rev_bag_non_cat,0) AS rev_bag_non_cat,
COALESCE(s2.qty_bag_non_cat,0) AS qty_bag_non_cat,
COALESCE(s2.rev_mileage_multiplier,0) AS rev_mileage_multiplier,
COALESCE(s2.qty_mileage_multiplier,0) AS qty_mileage_multiplier,
COALESCE(s2.rev_tsa,0) AS rev_tsa,
COALESCE(s2.qty_tsa,0) AS qty_tsa,
COALESCE(s2.rev_upgrade,0) AS rev_upgrade,
COALESCE(s2.qty_upgrade,0) AS qty_upgrade,
COALESCE(s2.ancll_rev_ttl,0)-COALESCE(s2.rev_pref_seats,0)-COALESCE(s2.rev_bag,0)-COALESCE(s2.rev_bag_non_cat,0)-COALESCE(s2.rev_mileage_multiplier,0)-COALESCE(s2.rev_tsa,0)-COALESCE(s2.rev_upgrade,0) AS rev_other_non_cat,
COALESCE(s2.ancll_qty_ttl,0)-COALESCE(s2.qty_pref_seats,0)-COALESCE(s2.qty_bag,0)-COALESCE(s2.qty_bag_non_cat,0)-COALESCE(s2.qty_mileage_multiplier,0)-COALESCE(s2.qty_tsa,0)-COALESCE(s2.qty_upgrade,0) AS qty_other_non_cat
--COALESCE(s2.rev_other_non_cat,0) + COALESCE(s2.rev_non_cat_other,0) AS rev_other_non_cat,
--COALESCE(s2.qty_other_non_cat,0) + COALESCE(s2.qty_non_cat_other,0) AS qty_other_non_cat,
--COALESCE(s2.rev_other_non_cat,0) + COALESCE(s2.rev_non_cat_other,0) AS rev_other_non_cat,
--COALESCE(s2.qty_other_non_cat,0) + COALESCE(s2.qty_non_cat_other,0) AS qty_other_non_cat,
FROM PROD_LYLTY_TRANS_VW.CUST_FLIGHT_ACTVTY_TRANS t1
LEFT JOIN
----------------------------AGGREGATE TABLE FOR MERCH AS S2------------------------------------------------
(SELECT
s1.party_id,
s1.ticket_nbr,
s1.ticket_issue_dt,
s1.SEG_ARVL_AIRPRT_IATA_CD,
s1.SEG_DEP_AIRPRT_IATA_CD,
s1.SEG_GEOG_AREA_CD,
s1.SEG_GEOG_AREA_DESC,
SUM(s1.ANCLRY_PROD_PRICE_USD_AMT) AS ancll_rev_ttl,
SUM(s1.ANCLRY_PROD_SLS_QTY) AS ancll_qty_ttl,
SUM(CASE WHEN s1.ANCLRY_PROD_GROUP_CD = 'SA' THEN s1.ANCLRY_PROD_PRICE_USD_AMT ELSE 0 END) AS rev_pref_seats,
SUM(CASE WHEN s1.ANCLRY_PROD_GROUP_CD = 'SA' THEN s1.ANCLRY_PROD_SLS_QTY ELSE 0 END) AS qty_pref_seats,
SUM(CASE WHEN s1.ANCLRY_PROD_GROUP_CD = 'BG' THEN s1.ANCLRY_PROD_PRICE_USD_AMT ELSE 0 END) AS rev_bag,
SUM(CASE WHEN s1.ANCLRY_PROD_GROUP_CD = 'BG' THEN s1.ANCLRY_PROD_SLS_QTY ELSE 0 END) AS qty_bag,
SUM(CASE WHEN s1.ANCLRY_PROD_GROUP_CD = '0' AND s1.ANCLRY_PROD_COMERCL_NM LIKE ANY ('UP100%','UP50%','UP70%') THEN s1.ANCLRY_PROD_PRICE_USD_AMT ELSE 0 END) AS rev_bag_non_cat,
SUM(CASE WHEN s1.ANCLRY_PROD_GROUP_CD = '0' AND s1.ANCLRY_PROD_COMERCL_NM LIKE ANY ('UP100%','UP50%','UP70%') THEN s1.ANCLRY_PROD_SLS_QTY ELSE 0 END) AS qty_bag_non_cat,
SUM(CASE WHEN s1.ANCLRY_PROD_GROUP_CD = '0' AND s1.ANCLRY_PROD_COMERCL_NM NOT LIKE ALL ('UP100%','UP50%','UP70%') THEN s1.ANCLRY_PROD_PRICE_USD_AMT ELSE 0 END) AS rev_other_non_cat,
SUM(CASE WHEN s1.ANCLRY_PROD_GROUP_CD = '0' AND s1.ANCLRY_PROD_COMERCL_NM NOT LIKE ALL ('UP100%','UP50%','UP70%') THEN s1.ANCLRY_PROD_SLS_QTY ELSE 0 END) AS qty_other_non_cat,
SUM(CASE WHEN s1.ANCLRY_PROD_GROUP_CD = 'FF' THEN s1.ANCLRY_PROD_PRICE_USD_AMT ELSE 0 END) AS rev_mileage_multiplier,
SUM(CASE WHEN s1.ANCLRY_PROD_GROUP_CD = 'FF' THEN s1.ANCLRY_PROD_SLS_QTY ELSE 0 END) AS qty_mileage_multiplier,
SUM(CASE WHEN s1.ANCLRY_PROD_GROUP_CD = 'TS' THEN s1.ANCLRY_PROD_PRICE_USD_AMT ELSE 0 END) AS rev_tsa,
SUM(CASE WHEN s1.ANCLRY_PROD_GROUP_CD = 'TS' THEN s1.ANCLRY_PROD_SLS_QTY ELSE 0 END) AS qty_tsa,
SUM(CASE WHEN s1.ANCLRY_PROD_GROUP_CD = 'UP' THEN s1.ANCLRY_PROD_PRICE_USD_AMT ELSE 0 END) AS rev_upgrade,
SUM(CASE WHEN s1.ANCLRY_PROD_GROUP_CD = 'UP' THEN s1.ANCLRY_PROD_SLS_QTY ELSE 0 END) AS qty_upgrade,
SUM(CASE WHEN s1.ANCLRY_PROD_GROUP_CD = '99' THEN s1.ANCLRY_PROD_PRICE_USD_AMT ELSE 0 END) AS rev_non_cat_other,
SUM(CASE WHEN s1.ANCLRY_PROD_GROUP_CD = '99' THEN s1.ANCLRY_PROD_SLS_QTY ELSE 0 END) AS qty_non_cat_other
FROM
----------------------------MERCH TABLES UNION AS S1--------------------------------
(SELECT
t1.ANCLRY_PROD_COMERCL_NM, /*group by product type*/
t1.ANCLRY_PROD_PRICE_USD_AMT, /*ancly price*/
t1.ANCLRY_PROD_SLS_QTY, /*quantity*/
--t1.ANCLRY_SLS_AIRLN_ACCT_CD, /*all fields are AA - remove field and add as filter*/
--t1.ANCLRY_SLS_CHANL_CD, /*channel - we can remove this field?*/
--t1.ANCLRY_SLS_ISSUE_DT, /*change to days before departure*/
--t1.DOMNT_AIRLN_IATA_CD, /*dominant airline code - mostly aa - can we drop this too or filter to AA*/
t1.LYLTY_ACCT_ID, /*lylty account id*/
t1.PARTY_ID, /*party id*/
t1.PNR_CREATE_DT, /*pnr create dt*/
t1.PNR_LOCTR_ID, /*pnr lctr id*/
--t1.REV_ACCT_PAYMNT_TYPE_CD, /*payment type - can we drop this as almost all is cc*/
t1.SEG_ARVL_AIRPRT_IATA_CD, /*arrival city*/
t1.SEG_DEP_AIRPRT_IATA_CD, /*dept city*/
t1.SEG_DEP_DT, /*dept date - all null*/
t1.SEG_GEOG_AREA_CD, /*geo code*/
t1.SEG_GEOG_AREA_DESC, /*geo desc*/
t1.TCN_NBR, /*transaction number - check if its needed*/
t1.TICKET_ISSUE_DT, /*ticket issue dt*/
t1.TICKET_NBR, /*ticket number*/
t1.TKT_CART_NBR, /*mostly null*/
--t1.TKT_MKT_AIRLN_IATA_CD, /*should we limit to AA?*/
CAST (000 AS FLOAT) AS ANCLRY_SLS_AMT_CONV_USD,
CAST(000 AS VARCHAR(20)) AS PRAS_DOC_STATUS_DESC,
CAST(000 AS VARCHAR(20)) AS ANCLRY_PROD_GROUP_CD
--CAST(000 AS VARCHAR(20)) AS SPCL_SRVC_CD
FROM PROD_MERCH_VW.MERCH_ANCLRY_NON_CTLG_SLS t1
--WHERE 1=1
--AND t1.ANCLRY_SLS_AIRLN_ACCT_CD = '001'
--AND COALESCE(t1.ANCLRY_PRD_PRICE_LCL_CRNCY_CD,0) = 'USD'
--AND t1.PARTY_ID = '159011984'
--AND t1.ticket_nbr = '2386605942'
--AND t1.PNR_LOCTR_ID IS NOT NULL
--AND t1.SEG_DEP_DT BETWEEN CURRENT_DATE-360 AND CURRENT_DATE
UNION ALL
SELECT
t2.ANCLRY_PROD_COMERCL_NM, /*product name*/
t2.ANCLRY_PROD_PRICE_USD_AMT, /*ancly price*/
t2.ANCLRY_PROD_SLS_QTY, /*quantity*/
--t2.ANCLRY_SLS_AIRLN_ACCT_CD, /*all fields are AA - remove field and add as filter*/
--t2.ANCLRY_SLS_CHANL_CD, /*channel - keep*/
--t2.ANCLRY_SLS_ISSUE_DT, /*change to days before departure*/
--CAST(000 AS VARCHAR(20)) AS DOMNT_AIRLN_IATA_CD, /*drop*/
t2.LYLTY_ACCT_ID, /*start with*/
t2.PARTY_ID, /*lylty and non lylty*/
t2.PNR_CREATE_DT, /*pnr create dt*/
t2.PNR_LOCTR_ID, /*pnr lctr id*/
--t2.REV_ACCT_PAYMNT_TYPE_CD, /*all cc*/
t2.SEG_ARVL_AIRPRT_IATA_CD, /*arrival city*/
t2.SEG_DEP_AIRPRT_IATA_CD, /*dept city*/
t2.SEG_DEP_DT, /*dept date - all null*/
t2.SEG_GEOG_AREA_CD, /*geo code*/
t2.SEG_GEOG_AREA_DESC, /*geo desc*/
t2.TCN_NBR, /*transaction number - check if its needed*/
t2.TICKET_ISSUE_DT, /*ticket issue dt*/
t2.TICKET_NBR, /*ticket number*/
t2.TKT_CART_NBR, /*keep*/
--t2.TKT_MKT_AIRLN_IATA_CD, /*should we limit to AA?*/
t2.ANCLRY_SLS_AMT_CONV_USD, /*sales amount usd*/
t2.PRAS_DOC_STATUS_DESC, /*status*/
t2.ANCLRY_PROD_GROUP_CD /*program code*/
--t2.SPCL_SRVC_CD /*seat or null*/
FROM PROD_MERCH_VW.MERCH_ANCLRY_SLS t2
--WHERE 1=1
--AND t2.PNR_LOCTR_ID IS NOT NULL
--AND t2.ANCLRY_SLS_AIRLN_ACCT_CD = '001'
--AND COALESCE(t2.PRAS_DOC_STATUS_DESC,'OTHER') <> 'REFUNDED'
--AND t2.SEG_DEP_DT BETWEEN CURRENT_DATE-360 AND CURRENT_DATE
--AND COALESCE(t2.ANCLRY_PRD_PRICE_LCL_CRNCY_CD,0) = 'USD'
--AND t2.PARTY_ID = '159011984'
--AND t2.ticket_nbr = '2386605942'
)s1
---------------------------GROUP BY FOR MERCH TABLE S2-----------------------
GROUP BY 1,2,3,4,5,6,7)s2
---------------------------JOIN TO LYLTY TRANSACTION TABLE----------------
ON t1.TICKET_NBR = s2.TICKET_NBR
AND t1.TICKET_ISSUE_DT = s2.TICKET_ISSUE_DT
AND t1.PARTY_ID = s2.PARTY_ID
AND t1.SEG_DEP_AIRPRT_IATA_CD = s2.SEG_DEP_AIRPRT_IATA_CD
AND t1.SEG_ARVL_AIRPRT_IATA_CD = s2.SEG_ARVL_AIRPRT_IATA_CD
LEFT JOIN PROD_PNR_BOOKING_VWS.PNR_BUSINES_LEISR_TAG_ML t2
ON t1.PNR_LOCTR_ID = t2.PNR_LOCTR_ID
AND t1.PNR_CREATE_DT = t2.PNR_CREATE_DT
AND t1.PNR_CREATE_TM = t2.PNR_CREATE_TM
INNER JOIN
(SELECT *
FROM PROD_CUST_CORE_METRICS_VW.INDVDL_CUST_PROFIL WHERE AADVAN_MBR_IND = 'Y' AND ANALYZ_START_DT <= '2020-03-31' QUALIFY ROW_NUMBER() OVER (PARTITION BY PARTY_ID ORDER BY ANALYZ_START_DT DESC)=1) t3
ON t1.PARTY_ID = t3.PARTY_ID
--------------------------- WHERE STATEMENT--------------------------------
WHERE 1=1
AND t1.PNR_LOCTR_ID IS NOT NULL
AND t1.SEG_DEP_DT BETWEEN '2019-01-01' AND '2019-12-31'
AND t1.MKT_AIRLN_IATA_CD = 'AA' /*limit to aa - both marketing, operating, accounting*/
AND t1.OPERAT_AIRLN_IATA_CD = 'AA' /*limit to aa - both marketing, operating, accounting*/
AND t1.ACCT_AIRLN_IATA_CD = 'AA' /*limit to aa - both marketing, operating, accounting*/
AND t1.SEG_INTRVL_TYPE IS NOT NULL
AND t1.COUPON_STATUS_CD = 'USED'
AND t1.LYLTY_ACCT_ID IS NOT NULL
AND t1.TKT_TYPE IN ('A','R')
AND (CASE WHEN t3.POSTAL_CD='' THEN null ELSE t3.POSTAL_CD END) IS NOT null
AND t1.SEG_DEP_DT-t1.TICKET_ISSUE_DT >= 0
AND t3.INDVDL_AGE_QTY IS NOT NULL
SAMPLE 1000;"""
# Reading data from sql to dataframe
df = pd.read_sql(query, con)
# set limits to output
pd.set_option('display.max_rows', 100)
pd.set_option('display.max_columns', 100)
# data sample
df.head()
| PARTY_ID | LYLTY_ACCT_ID | loyalty_Ind | LYLTY_LEVEL_CD | MACRO_SEG_ID | INDVDL_AGE_QTY | GENDER_CD | POSTAL_CD | BKGS_QTY | AADVAN_MBR_TTL_YEARS_QTY | AA_OPER_REVNUE_SEG_AMT | AA_OPER_REVNUE_SEG_QTY | ACRU_AEM_QTY | ACRU_FLIGHT_AEM_QTY | ACRU_FLIGHT_SEG_QTY | BARCLAYS_CARD_HOLDER_IND | BKGS_INTL_QTY | BKGS_PAX_QTY_AVG | CHECK_BAGS_PER_TRIP_AVG | CITIBNK_PREMIM_CARD_HLDR_IND | FLOWN_SEG_QTY | LOUNGE_MBR_PRTCPNT_IND | TRIP_QTY | aa_value_code | booking_month | PNR_LOCTR_ID | PNR_CREATE_DT | TICKET_NBR | TICKET_ISSUE_DT | SEG_DEP_DT | days_before_dept | day_of_week_dep | dep_month | SEG_DEP_AIRPRT_IATA_CD | SEG_ARVL_AIRPRT_IATA_CD | MKT_N_DIRECTN_AIRPRT_PAIR_CD | DEP_CNTRY_CD | ARVL_CNTRY_CD | AIRPRT_IATA_GRP | AIRPRT_IATA_TOP | TACTL_ENTITY | SEG_INTRVL_TYPE | MILE_GREAT_CIRCLE_DISTANC_QTY | SEG_GEOG_AREA_CD | SEG_GEOG_AREA_DESC | CABIN_BOOKED | CABIN_FLOWN | UPG_TYPE | upgrade_ind | ACCT_FARE_CLASS_CD | ACCT_FARE_CLASS_TYPE | TKT_TYPE | BX_IND | BKG_BUSINES_LEISR_IND | Spend_Ind | FLIGHT_REV | ancillary_spend_ind | ancll_rev_ttl | ancll_qty_ttl | rev_pref_seats | qty_pref_seats | rev_bag | qty_bag | rev_bag_non_cat | qty_bag_non_cat | rev_mileage_multiplier | qty_mileage_multiplier | rev_tsa | qty_tsa | rev_upgrade | qty_upgrade | rev_other_non_cat | qty_other_non_cat | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1.481332e+11 | 0D30MH6 | 1 | G | 1 | 54 | M | 19701 | 8.5 | 23.0 | 2646.99 | 17.0 | 40475.0 | 19200.5 | 17.0 | Y | 1.5 | 3.0 | 16.0 | N | 17.0 | N | 15.0 | High Value | 8 | QBBXIF | 2019-07-10 | 7.405893e+09 | 2019-08-30 | 2019-09-04 | 5 | 4 | 9 | PHL | SFO | PHLSFO | US | US | MAJOR HUB | PHL | NE-H2H ... | D | 2521 | None | None | Y | Y | NONE | 0 | Q | Economy | R | N | B | 1 | 169.30 | 0 | 0.00 | 0 | 0.00 | 0 | 0.0 | 0 | 0.0 | 0 | 0.0 | 0 | 0.0 | 0 | 0.0 | 0 | 0.0 | 0 |
| 1 | 5.454613e+09 | 788EVT2 | 1 | R | 3 | 39 | M | 75215 | 12.5 | 13.0 | 3279.76 | 18.5 | 18461.5 | 18461.5 | 19.0 | N | 2.5 | 2.0 | 6.0 | N | 18.5 | N | 19.5 | High Value | 10 | AESCRA | 2019-10-25 | 2.385458e+09 | 2019-10-25 | 2019-11-22 | 28 | 6 | 11 | DFW | CLE | CLEDFW | US | US | MAJOR HUB | DFW | MA-VA/WV/PA/OH ... | D | 1021 | D | US48 | Y | Y | NONE | 0 | B | Basic Economy | R | N | L | 1 | 133.95 | 1 | 37.82 | 1 | 37.82 | 1 | 0.0 | 0 | 0.0 | 0 | 0.0 | 0 | 0.0 | 0 | 0.0 | 0 | 0.0 | 0 |
| 2 | 1.151162e+10 | AMY5812 | 1 | P | 1 | 61 | M | 60120 | 14.5 | 32.0 | 3131.88 | 16.0 | 61140.0 | 25108.0 | 16.0 | N | 0.0 | 2.0 | 52.0 | Y | 16.0 | N | 15.0 | High Value | 9 | MBHGQB | 2019-09-06 | 7.454238e+09 | 2019-09-06 | 2019-09-09 | 3 | 2 | 9 | CLT | GSP | CLTGSP | US | US | MAJOR HUB | CLT | S-SOUTH ... | D | 75 | None | None | Y | Y | NONE | 0 | L | Economy | R | N | B | 1 | 94.64 | 0 | 0.00 | 0 | 0.00 | 0 | 0.0 | 0 | 0.0 | 0 | 0.0 | 0 | 0.0 | 0 | 0.0 | 0 | 0.0 | 0 |
| 3 | 5.692366e+10 | 3F7UX02 | 1 | R | 4 | 28 | F | 02472 | 5.5 | 11.0 | 690.97 | 6.5 | 4130.0 | 4130.0 | 7.0 | N | 0.0 | 2.0 | 12.0 | N | 11.5 | N | 6.5 | Medium Value | 10 | SQDKMS | 2019-10-10 | 2.382890e+09 | 2019-10-10 | 2019-12-24 | 75 | 3 | 12 | BOS | PHL | BOSPHL | US | US | SPOKE | SPOKE | NE-H2H ... | D | 280 | None | None | Y | Y | NONE | 0 | B | Basic Economy | R | N | L | 1 | 85.83 | 0 | 0.00 | 0 | 0.00 | 0 | 0.0 | 0 | 0.0 | 0 | 0.0 | 0 | 0.0 | 0 | 0.0 | 0 | 0.0 | 0 |
| 4 | 2.362382e+11 | 27YK4X2 | 1 | G | 1 | 38 | F | 77077 | 28.5 | 4.0 | 11221.60 | 59.0 | 125785.5 | 112285.5 | 59.0 | N | 1.0 | 2.0 | 51.0 | N | 64.0 | Y | 45.0 | High Value | 5 | XUQCRL | 2019-05-20 | 7.362932e+09 | 2019-05-21 | 2019-06-07 | 17 | 6 | 6 | PHL | IAH | IAHPHL | US | US | MAJOR HUB | PHL | NWC-TEXAS ... | D | 1324 | None | None | Y | Y | NONE | 0 | S | Economy | R | N | B | 1 | 136.74 | 0 | 0.00 | 0 | 0.00 | 0 | 0.0 | 0 | 0.0 | 0 | 0.0 | 0 | 0.0 | 0 | 0.0 | 0 | 0.0 | 0 |
#Data Type
df.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 1000 entries, 0 to 999 Data columns (total 73 columns): PARTY_ID 1000 non-null float64 LYLTY_ACCT_ID 1000 non-null object loyalty_Ind 1000 non-null int64 LYLTY_LEVEL_CD 1000 non-null object MACRO_SEG_ID 1000 non-null int64 INDVDL_AGE_QTY 1000 non-null int64 GENDER_CD 1000 non-null object POSTAL_CD 1000 non-null object BKGS_QTY 1000 non-null float64 AADVAN_MBR_TTL_YEARS_QTY 1000 non-null float64 AA_OPER_REVNUE_SEG_AMT 1000 non-null float64 AA_OPER_REVNUE_SEG_QTY 1000 non-null float64 ACRU_AEM_QTY 1000 non-null float64 ACRU_FLIGHT_AEM_QTY 1000 non-null float64 ACRU_FLIGHT_SEG_QTY 1000 non-null float64 BARCLAYS_CARD_HOLDER_IND 1000 non-null object BKGS_INTL_QTY 1000 non-null float64 BKGS_PAX_QTY_AVG 1000 non-null float64 CHECK_BAGS_PER_TRIP_AVG 1000 non-null float64 CITIBNK_PREMIM_CARD_HLDR_IND 1000 non-null object FLOWN_SEG_QTY 1000 non-null float64 LOUNGE_MBR_PRTCPNT_IND 1000 non-null object TRIP_QTY 1000 non-null float64 aa_value_code 1000 non-null object booking_month 1000 non-null int64 PNR_LOCTR_ID 1000 non-null object PNR_CREATE_DT 1000 non-null object TICKET_NBR 1000 non-null float64 TICKET_ISSUE_DT 1000 non-null object SEG_DEP_DT 1000 non-null object days_before_dept 1000 non-null int64 day_of_week_dep 1000 non-null int64 dep_month 1000 non-null int64 SEG_DEP_AIRPRT_IATA_CD 1000 non-null object SEG_ARVL_AIRPRT_IATA_CD 1000 non-null object MKT_N_DIRECTN_AIRPRT_PAIR_CD 1000 non-null object DEP_CNTRY_CD 1000 non-null object ARVL_CNTRY_CD 1000 non-null object AIRPRT_IATA_GRP 1000 non-null object AIRPRT_IATA_TOP 1000 non-null object TACTL_ENTITY 1000 non-null object SEG_INTRVL_TYPE 1000 non-null object MILE_GREAT_CIRCLE_DISTANC_QTY 1000 non-null int64 SEG_GEOG_AREA_CD 218 non-null object SEG_GEOG_AREA_DESC 218 non-null object CABIN_BOOKED 1000 non-null object CABIN_FLOWN 1000 non-null object UPG_TYPE 1000 non-null object upgrade_ind 1000 non-null int64 ACCT_FARE_CLASS_CD 1000 non-null object ACCT_FARE_CLASS_TYPE 1000 non-null object TKT_TYPE 1000 non-null object BX_IND 1000 non-null object BKG_BUSINES_LEISR_IND 1000 non-null object Spend_Ind 1000 non-null int64 FLIGHT_REV 1000 non-null float64 ancillary_spend_ind 1000 non-null int64 ancll_rev_ttl 1000 non-null float64 ancll_qty_ttl 1000 non-null int64 rev_pref_seats 1000 non-null float64 qty_pref_seats 1000 non-null int64 rev_bag 1000 non-null float64 qty_bag 1000 non-null int64 rev_bag_non_cat 1000 non-null float64 qty_bag_non_cat 1000 non-null int64 rev_mileage_multiplier 1000 non-null float64 qty_mileage_multiplier 1000 non-null int64 rev_tsa 1000 non-null float64 qty_tsa 1000 non-null int64 rev_upgrade 1000 non-null float64 qty_upgrade 1000 non-null int64 rev_other_non_cat 1000 non-null float64 qty_other_non_cat 1000 non-null int64 dtypes: float64(23), int64(19), object(31) memory usage: 570.4+ KB
# Unique values
df.nunique()
PARTY_ID 1000 LYLTY_ACCT_ID 1000 loyalty_Ind 1 LYLTY_LEVEL_CD 6 MACRO_SEG_ID 11 INDVDL_AGE_QTY 76 GENDER_CD 4 POSTAL_CD 875 BKGS_QTY 69 AADVAN_MBR_TTL_YEARS_QTY 39 AA_OPER_REVNUE_SEG_AMT 832 AA_OPER_REVNUE_SEG_QTY 103 ACRU_AEM_QTY 857 ACRU_FLIGHT_AEM_QTY 802 ACRU_FLIGHT_SEG_QTY 68 BARCLAYS_CARD_HOLDER_IND 2 BKGS_INTL_QTY 23 BKGS_PAX_QTY_AVG 9 CHECK_BAGS_PER_TRIP_AVG 63 CITIBNK_PREMIM_CARD_HLDR_IND 2 FLOWN_SEG_QTY 110 LOUNGE_MBR_PRTCPNT_IND 2 TRIP_QTY 90 aa_value_code 3 booking_month 12 PNR_LOCTR_ID 1000 PNR_CREATE_DT 373 TICKET_NBR 1000 TICKET_ISSUE_DT 376 SEG_DEP_DT 333 days_before_dept 160 day_of_week_dep 7 dep_month 12 SEG_DEP_AIRPRT_IATA_CD 181 SEG_ARVL_AIRPRT_IATA_CD 156 MKT_N_DIRECTN_AIRPRT_PAIR_CD 510 DEP_CNTRY_CD 5 ARVL_CNTRY_CD 5 AIRPRT_IATA_GRP 6 AIRPRT_IATA_TOP 25 TACTL_ENTITY 41 SEG_INTRVL_TYPE 3 MILE_GREAT_CIRCLE_DISTANC_QTY 455 SEG_GEOG_AREA_CD 10 SEG_GEOG_AREA_DESC 10 CABIN_BOOKED 3 CABIN_FLOWN 3 UPG_TYPE 7 upgrade_ind 2 ACCT_FARE_CLASS_CD 20 ACCT_FARE_CLASS_TYPE 4 TKT_TYPE 2 BX_IND 2 BKG_BUSINES_LEISR_IND 2 Spend_Ind 2 FLIGHT_REV 817 ancillary_spend_ind 2 ancll_rev_ttl 89 ancll_qty_ttl 5 rev_pref_seats 70 qty_pref_seats 3 rev_bag 1 qty_bag 1 rev_bag_non_cat 12 qty_bag_non_cat 5 rev_mileage_multiplier 2 qty_mileage_multiplier 2 rev_tsa 2 qty_tsa 2 rev_upgrade 3 qty_upgrade 2 rev_other_non_cat 2 qty_other_non_cat 2 dtype: int64
# Scientific Notation to long format
pd.set_option('display.float_format', lambda x: '%.2f' % x)
# Data Statistics
df.describe().round().T
| count | mean | std | min | 25% | 50% | 75% | max | |
|---|---|---|---|---|---|---|---|---|
| PARTY_ID | 1000.00 | 106372368385.00 | 93520325819.00 | 20288584.00 | 10547375501.00 | 117226425201.00 | 170561838801.00 | 282263008001.00 |
| loyalty_Ind | 1000.00 | 1.00 | 0.00 | 1.00 | 1.00 | 1.00 | 1.00 | 1.00 |
| MACRO_SEG_ID | 1000.00 | 6.00 | 4.00 | 1.00 | 3.00 | 5.00 | 10.00 | 11.00 |
| INDVDL_AGE_QTY | 1000.00 | 49.00 | 15.00 | 5.00 | 37.00 | 50.00 | 60.00 | 91.00 |
| BKGS_QTY | 1000.00 | 5.00 | 8.00 | 0.00 | 0.00 | 2.00 | 6.00 | 72.00 |
| AADVAN_MBR_TTL_YEARS_QTY | 1000.00 | 15.00 | 10.00 | 2.00 | 5.00 | 12.00 | 23.00 | 40.00 |
| AA_OPER_REVNUE_SEG_AMT | 1000.00 | 1757.00 | 3000.00 | 0.00 | 160.00 | 648.00 | 2053.00 | 29922.00 |
| AA_OPER_REVNUE_SEG_QTY | 1000.00 | 9.00 | 14.00 | 0.00 | 1.00 | 4.00 | 11.00 | 122.00 |
| ACRU_AEM_QTY | 1000.00 | 34297.00 | 63263.00 | 0.00 | 1326.00 | 7224.00 | 40590.00 | 553949.00 |
| ACRU_FLIGHT_AEM_QTY | 1000.00 | 15906.00 | 32566.00 | 0.00 | 774.00 | 3688.00 | 13632.00 | 336215.00 |
| ACRU_FLIGHT_SEG_QTY | 1000.00 | 9.00 | 14.00 | 0.00 | 1.00 | 4.00 | 11.00 | 121.00 |
| BKGS_INTL_QTY | 1000.00 | 1.00 | 3.00 | 0.00 | 0.00 | 0.00 | 0.00 | 68.00 |
| BKGS_PAX_QTY_AVG | 1000.00 | 2.00 | 1.00 | 1.00 | 1.00 | 2.00 | 2.00 | 31.00 |
| CHECK_BAGS_PER_TRIP_AVG | 1000.00 | 8.00 | 14.00 | 0.00 | 0.00 | 3.00 | 8.00 | 184.00 |
| FLOWN_SEG_QTY | 1000.00 | 10.00 | 15.00 | 0.00 | 2.00 | 5.00 | 13.00 | 126.00 |
| TRIP_QTY | 1000.00 | 7.00 | 11.00 | 0.00 | 1.00 | 4.00 | 9.00 | 88.00 |
| booking_month | 1000.00 | 6.00 | 3.00 | 1.00 | 3.00 | 6.00 | 9.00 | 12.00 |
| TICKET_NBR | 1000.00 | 4321811223.00 | 2438655194.00 | 2173755731.00 | 2350600725.00 | 2380559074.00 | 7315702002.00 | 9511280750.00 |
| days_before_dept | 1000.00 | 37.00 | 51.00 | 0.00 | 7.00 | 20.00 | 46.00 | 331.00 |
| day_of_week_dep | 1000.00 | 4.00 | 2.00 | 1.00 | 2.00 | 4.00 | 6.00 | 7.00 |
| dep_month | 1000.00 | 7.00 | 3.00 | 1.00 | 4.00 | 7.00 | 9.00 | 12.00 |
| MILE_GREAT_CIRCLE_DISTANC_QTY | 1000.00 | 1078.00 | 1060.00 | 67.00 | 408.00 | 802.00 | 1271.00 | 7488.00 |
| upgrade_ind | 1000.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 |
| Spend_Ind | 1000.00 | 1.00 | 0.00 | 0.00 | 1.00 | 1.00 | 1.00 | 1.00 |
| FLIGHT_REV | 1000.00 | 181.00 | 222.00 | 0.00 | 75.00 | 134.00 | 222.00 | 3122.00 |
| ancillary_spend_ind | 1000.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 |
| ancll_rev_ttl | 1000.00 | 7.00 | 19.00 | 0.00 | 0.00 | 0.00 | 0.00 | 125.00 |
| ancll_qty_ttl | 1000.00 | 0.00 | 1.00 | 0.00 | 0.00 | 0.00 | 0.00 | 5.00 |
| rev_pref_seats | 1000.00 | 3.00 | 14.00 | 0.00 | 0.00 | 0.00 | 0.00 | 118.00 |
| qty_pref_seats | 1000.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 2.00 |
| rev_bag | 1000.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 |
| qty_bag | 1000.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 |
| rev_bag_non_cat | 1000.00 | 3.00 | 11.00 | 0.00 | 0.00 | 0.00 | 0.00 | 125.00 |
| qty_bag_non_cat | 1000.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 5.00 |
| rev_mileage_multiplier | 1000.00 | 0.00 | 2.00 | 0.00 | 0.00 | 0.00 | 0.00 | 52.00 |
| qty_mileage_multiplier | 1000.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 |
| rev_tsa | 1000.00 | 0.00 | 1.00 | 0.00 | 0.00 | 0.00 | 0.00 | 35.00 |
| qty_tsa | 1000.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 |
| rev_upgrade | 1000.00 | 0.00 | 3.00 | 0.00 | 0.00 | 0.00 | 0.00 | 86.00 |
| qty_upgrade | 1000.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 |
| rev_other_non_cat | 1000.00 | 0.00 | 5.00 | 0.00 | 0.00 | 0.00 | 0.00 | 75.00 |
| qty_other_non_cat | 1000.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 |
#statistical description of object values
df.describe(include=['object'])
| LYLTY_ACCT_ID | LYLTY_LEVEL_CD | GENDER_CD | POSTAL_CD | BARCLAYS_CARD_HOLDER_IND | CITIBNK_PREMIM_CARD_HLDR_IND | LOUNGE_MBR_PRTCPNT_IND | aa_value_code | PNR_LOCTR_ID | PNR_CREATE_DT | TICKET_ISSUE_DT | SEG_DEP_DT | SEG_DEP_AIRPRT_IATA_CD | SEG_ARVL_AIRPRT_IATA_CD | MKT_N_DIRECTN_AIRPRT_PAIR_CD | DEP_CNTRY_CD | ARVL_CNTRY_CD | AIRPRT_IATA_GRP | AIRPRT_IATA_TOP | TACTL_ENTITY | SEG_INTRVL_TYPE | SEG_GEOG_AREA_CD | SEG_GEOG_AREA_DESC | CABIN_BOOKED | CABIN_FLOWN | UPG_TYPE | ACCT_FARE_CLASS_CD | ACCT_FARE_CLASS_TYPE | TKT_TYPE | BX_IND | BKG_BUSINES_LEISR_IND | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 1000 | 1000 | 1000 | 1000 | 1000 | 1000 | 1000 | 1000 | 1000 | 1000 | 1000 | 1000 | 1000 | 1000 | 1000 | 1000 | 1000 | 1000 | 1000 | 1000 | 1000 | 218 | 218 | 1000 | 1000 | 1000 | 1000 | 1000 | 1000 | 1000 | 1000 |
| unique | 1000 | 6 | 4 | 875 | 2 | 2 | 2 | 3 | 1000 | 373 | 376 | 333 | 181 | 156 | 510 | 5 | 5 | 6 | 25 | 41 | 3 | 10 | 10 | 3 | 3 | 7 | 20 | 4 | 2 | 2 | 2 |
| top | 0M0XM44 | R | M | 76034 | N | N | N | Medium Value | KQICJB | 2019-02-05 | 2019-04-08 | 2019-01-27 | DFW | DFW | DFWORD | US | US | MAJOR HUB | SPOKE | S-SOUTH ... | D | D | US48 | Y | Y | NONE | G | Economy | R | N | B |
| freq | 1 | 619 | 608 | 6 | 877 | 787 | 901 | 425 | 1 | 9 | 8 | 9 | 162 | 166 | 11 | 929 | 944 | 409 | 332 | 80 | 873 | 189 | 189 | 909 | 819 | 867 | 127 | 878 | 903 | 910 | 511 |
# Getting mean null values in each column
df.isnull().sum().sort_values(ascending=True)
PARTY_ID 0 BX_IND 0 TKT_TYPE 0 ACCT_FARE_CLASS_TYPE 0 ACCT_FARE_CLASS_CD 0 upgrade_ind 0 UPG_TYPE 0 BKG_BUSINES_LEISR_IND 0 CABIN_FLOWN 0 MILE_GREAT_CIRCLE_DISTANC_QTY 0 SEG_INTRVL_TYPE 0 TACTL_ENTITY 0 AIRPRT_IATA_TOP 0 AIRPRT_IATA_GRP 0 ARVL_CNTRY_CD 0 CABIN_BOOKED 0 rev_other_non_cat 0 Spend_Ind 0 ancillary_spend_ind 0 qty_upgrade 0 rev_upgrade 0 qty_tsa 0 rev_tsa 0 qty_mileage_multiplier 0 rev_mileage_multiplier 0 FLIGHT_REV 0 qty_bag_non_cat 0 qty_bag 0 rev_bag 0 qty_pref_seats 0 rev_pref_seats 0 ancll_qty_ttl 0 ancll_rev_ttl 0 rev_bag_non_cat 0 MKT_N_DIRECTN_AIRPRT_PAIR_CD 0 DEP_CNTRY_CD 0 SEG_DEP_AIRPRT_IATA_CD 0 BARCLAYS_CARD_HOLDER_IND 0 ACRU_FLIGHT_SEG_QTY 0 ACRU_FLIGHT_AEM_QTY 0 ACRU_AEM_QTY 0 AA_OPER_REVNUE_SEG_QTY 0 AA_OPER_REVNUE_SEG_AMT 0 AADVAN_MBR_TTL_YEARS_QTY 0 BKGS_QTY 0 POSTAL_CD 0 GENDER_CD 0 INDVDL_AGE_QTY 0 MACRO_SEG_ID 0 LYLTY_LEVEL_CD 0 loyalty_Ind 0 LYLTY_ACCT_ID 0 BKGS_INTL_QTY 0 SEG_ARVL_AIRPRT_IATA_CD 0 BKGS_PAX_QTY_AVG 0 CITIBNK_PREMIM_CARD_HLDR_IND 0 dep_month 0 day_of_week_dep 0 days_before_dept 0 SEG_DEP_DT 0 TICKET_ISSUE_DT 0 TICKET_NBR 0 CHECK_BAGS_PER_TRIP_AVG 0 qty_other_non_cat 0 PNR_LOCTR_ID 0 booking_month 0 aa_value_code 0 TRIP_QTY 0 LOUNGE_MBR_PRTCPNT_IND 0 FLOWN_SEG_QTY 0 PNR_CREATE_DT 0 SEG_GEOG_AREA_CD 782 SEG_GEOG_AREA_DESC 782 dtype: int64
#Data Shape
print('Total Rows',df.shape[0],'Total Features',df.shape[1])
Total Rows 1000 Total Features 73
# Unique Values for Categgorical Columns
print("Loyalty Level Codes:")
print(pd.unique(df['LYLTY_LEVEL_CD']))
print("---------------------")
print("Macro Segment IDs:")
print(pd.unique(df['MACRO_SEG_ID']))
print("---------------------")
print("Gender Codes:")
print(pd.unique(df['GENDER_CD']))
print("---------------------")
print("Segment Types:")
print(pd.unique(df['SEG_INTRVL_TYPE']))
print("---------------------")
print("Cabin Booked:")
print(pd.unique(df['CABIN_BOOKED']))
print("---------------------")
print("Cabin Flown:")
print(pd.unique(df['CABIN_FLOWN']))
print("---------------------")
print("Upgrade Type:")
print(pd.unique(df['UPG_TYPE']))
Loyalty Level Codes: ['G ' 'R ' 'P ' 'T ' 'E ' 'C '] --------------------- Macro Segment IDs: [ 1 3 4 10 9 11 2 8 6 5 7] --------------------- Gender Codes: ['M ' 'F ' 'U ' 'X '] --------------------- Segment Types: ['D' 'IL' 'IS'] --------------------- Cabin Booked: ['Y' 'C' 'W'] --------------------- Cabin Flown: ['Y ' 'C ' 'W '] --------------------- Upgrade Type: ['NONE' 'COMP' 'UPFAR' 'UPAWD' 'E500' 'EVIP' 'LFBU']
# Create Target Variable
df['Total_Revenue']= df['FLIGHT_REV'] + df['ancll_rev_ttl']
# Convert to dinamic once we have access to birth date
bins = [0,23, 39, 55, 74, 150]
labels = ['Z','Millennial', 'X', 'Boomers','Silent']
df['age_cat'] = pd.cut(df['INDVDL_AGE_QTY'], bins=bins, labels=labels, right=False)
# Dropped rows that were missing age info
df.dropna(subset = ['INDVDL_AGE_QTY'], inplace=True)
# Replace null values
df["BKGS_PAX_QTY_AVG"].fillna(1, inplace = True)
# Replace null values
df['UPG_TYPE'].fillna("NONE", inplace = True)
df.drop(['SEG_GEOG_AREA_CD','SEG_GEOG_AREA_DESC'], axis = 1,inplace = True)
# Replace null values
df["BKG_BUSINES_LEISR_IND"].fillna("L", inplace = True)
# Dropped rows that were missing age info
df.dropna(subset = ['age_cat'], inplace=True)
# Dropped rows that were missing age info
df.dropna(subset = ['GENDER_CD'], inplace=True)
# Dropped rows that were missing age info
df.dropna(subset = ['FLIGHT_REV'], inplace=True)
# Replace null values
df["TACTL_ENTITY"].fillna("None", inplace = True)
# Replace null values
df["POSTAL_CD"].fillna("None", inplace = True)
# Removing Whitespace
df["BKG_BUSINES_LEISR_IND"] = df["BKG_BUSINES_LEISR_IND"].str.strip()
df["SEG_ARVL_AIRPRT_IATA_CD"] = df["SEG_ARVL_AIRPRT_IATA_CD"].str.strip()
df["SEG_DEP_AIRPRT_IATA_CD"] = df["SEG_DEP_AIRPRT_IATA_CD"].str.strip()
df["CABIN_BOOKED"] = df["CABIN_BOOKED"].str.strip()
df["CABIN_FLOWN"] = df["CABIN_FLOWN"].str.strip()
df["DEP_CNTRY_CD"] = df["DEP_CNTRY_CD"].str.strip()
df["ARVL_CNTRY_CD"] = df["ARVL_CNTRY_CD"].str.strip()
df["POSTAL_CD"] = df["POSTAL_CD"].str.strip()
# Convert datetime
df['SEG_DEP_DT'] = pd.to_datetime(df['SEG_DEP_DT'])
df['TICKET_ISSUE_DT'] = pd.to_datetime(df['TICKET_ISSUE_DT'])
df['PNR_CREATE_DT'] = pd.to_datetime(df['PNR_CREATE_DT'])
#convert float to int
df["MACRO_SEG_ID"]=df["MACRO_SEG_ID"].astype(str)
df["dep_month"]=df["dep_month"].astype(str)
df["day_of_week_dep"]=df["day_of_week_dep"].astype(str)
df["PARTY_ID"]=df["PARTY_ID"].astype(int)
df["INDVDL_AGE_QTY"]=df["INDVDL_AGE_QTY"].astype(int)
df["BKGS_PAX_QTY_AVG"]=df["BKGS_PAX_QTY_AVG"].astype(int)
# Feature creation
#Join data with the lat/lon info
df_l=pd.read_csv('All.csv')
df_join=pd.merge(df_l, df, on='SEG_DEP_AIRPRT_IATA_CD', how='inner')
# Creating arrival city and departure city
#Creating new columns to have Coordinates
df_join.loc[:, 'city_arr'] = df_join["SEG_ARVL_AIRPRT_IATA_CD"].copy()
df_join.loc[:, 'city_dep'] = df_join["City"].copy()
#Seperating dfs
df_city = df_join[['SEG_DEP_AIRPRT_IATA_CD', 'City']]
df_city = df_join.set_index('SEG_DEP_AIRPRT_IATA_CD')['City']
# Creating Dictionary
citi_dict = df_city.T.to_dict()
# Replacing Dictionary with values
df_join['city_arr']=df_join['city_arr'].map(citi_dict)
#Creating new columns to have Coordinates
df_join.loc[:, 'location_lat_dep'] = df_join["SEG_DEP_AIRPRT_IATA_CD"].copy()
df_join.loc[:, 'location_long_dep'] = df_join["SEG_DEP_AIRPRT_IATA_CD"].copy()
df_join.loc[:, 'location_lat_arr'] = df_join["SEG_ARVL_AIRPRT_IATA_CD"].copy()
df_join.loc[:, 'location_long_arr'] =df_join["SEG_ARVL_AIRPRT_IATA_CD"].copy()
#Seperating lat and long dfs
df_lat = df_l[['SEG_DEP_AIRPRT_IATA_CD', 'Lat']]
df_lat = df_lat.set_index('SEG_DEP_AIRPRT_IATA_CD')['Lat']
df_lon = df_l[['SEG_DEP_AIRPRT_IATA_CD', 'lon']]
df_lon = df_lon.set_index('SEG_DEP_AIRPRT_IATA_CD')['lon']
# Creating Dictionary
lat_dict = df_lat.T.to_dict()
lon_dict = df_lon.T.to_dict()
# Replacing Dictionary with values
df_join['location_lat_dep']=df_join['location_lat_dep'].map(lat_dict)
df_join['location_long_dep']=df_join['location_long_dep'].map(lon_dict)
df_join['location_lat_arr']=df_join['location_lat_arr'].map(lat_dict)
df_join['location_long_arr']=df_join['location_long_arr'].map(lon_dict)
df=df_join
#dataframe
#International Flights
overseas_df = df[(df["DEP_CNTRY_CD"] != df["ARVL_CNTRY_CD"])]
#print("International Travel Dataset Size After Filtering : ", overseas_df.shape)
#Domestic Flights
US_df = df[df["DEP_CNTRY_CD"] == df["ARVL_CNTRY_CD"]]
#print("International Travel Dataset Size After Filtering : ", US_df.shape)
#Flight column indicating international or domestic flight
df.loc[(df["DEP_CNTRY_CD"] != df["ARVL_CNTRY_CD"]), 'Flight'] = 'International'
df.loc[df["DEP_CNTRY_CD"] == df["ARVL_CNTRY_CD"], 'Flight'] = 'Domestic'
# Data Statistics on Age
df['INDVDL_AGE_QTY'].describe()
count 1000.00 mean 48.80 std 15.00 min 5.00 25% 37.00 50% 50.00 75% 60.00 max 91.00 Name: INDVDL_AGE_QTY, dtype: float64
# Data Statistics on Age Category
df['age_cat'].value_counts()
Boomers 342 X 331 Millennial 249 Silent 42 Z 36 Name: age_cat, dtype: int64
# Getting mean null values in each column
df.isnull().sum().sort_values(ascending=True)
Unnamed: 0 0 ancillary_spend_ind 0 FLIGHT_REV 0 Spend_Ind 0 BKG_BUSINES_LEISR_IND 0 BX_IND 0 TKT_TYPE 0 ACCT_FARE_CLASS_TYPE 0 ACCT_FARE_CLASS_CD 0 upgrade_ind 0 UPG_TYPE 0 CABIN_FLOWN 0 CABIN_BOOKED 0 MILE_GREAT_CIRCLE_DISTANC_QTY 0 SEG_INTRVL_TYPE 0 TACTL_ENTITY 0 AIRPRT_IATA_TOP 0 AIRPRT_IATA_GRP 0 ancll_rev_ttl 0 ancll_qty_ttl 0 rev_pref_seats 0 qty_pref_seats 0 location_lat_arr 0 location_long_dep 0 location_lat_dep 0 city_dep 0 age_cat 0 Total_Revenue 0 qty_other_non_cat 0 rev_other_non_cat 0 ARVL_CNTRY_CD 0 qty_upgrade 0 qty_tsa 0 rev_tsa 0 qty_mileage_multiplier 0 rev_mileage_multiplier 0 qty_bag_non_cat 0 rev_bag_non_cat 0 qty_bag 0 rev_bag 0 rev_upgrade 0 location_long_arr 0 DEP_CNTRY_CD 0 SEG_ARVL_AIRPRT_IATA_CD 0 AA_OPER_REVNUE_SEG_AMT 0 AADVAN_MBR_TTL_YEARS_QTY 0 BKGS_QTY 0 POSTAL_CD 0 GENDER_CD 0 INDVDL_AGE_QTY 0 MACRO_SEG_ID 0 LYLTY_LEVEL_CD 0 loyalty_Ind 0 LYLTY_ACCT_ID 0 PARTY_ID 0 lon 0 Lat 0 SEG_DEP_AIRPRT_IATA_CD 0 Country 0 City 0 Airport Name 0 AA_OPER_REVNUE_SEG_QTY 0 MKT_N_DIRECTN_AIRPRT_PAIR_CD 0 ACRU_AEM_QTY 0 ACRU_FLIGHT_SEG_QTY 0 dep_month 0 day_of_week_dep 0 days_before_dept 0 SEG_DEP_DT 0 TICKET_ISSUE_DT 0 TICKET_NBR 0 PNR_CREATE_DT 0 PNR_LOCTR_ID 0 ACRU_FLIGHT_AEM_QTY 0 booking_month 0 TRIP_QTY 0 LOUNGE_MBR_PRTCPNT_IND 0 FLOWN_SEG_QTY 0 CITIBNK_PREMIM_CARD_HLDR_IND 0 CHECK_BAGS_PER_TRIP_AVG 0 BKGS_PAX_QTY_AVG 0 BKGS_INTL_QTY 0 BARCLAYS_CARD_HOLDER_IND 0 aa_value_code 0 Flight 0 city_arr 74 dtype: int64
# Removing all warnings
import warnings
warnings.filterwarnings("ignore")
#Setting graph color themes
sns.set_palette(sns.color_palette("Set3", 10))
# histogram of total revenue distribution
ax = sns.distplot(df.Total_Revenue, hist=True, hist_kws={"edgecolor": 'w', "linewidth": 3}, kde_kws={"linewidth": 3}, bins=[0, 50, 100, 150, 200, 250, 300, 350, 400, 450])
# ticks
plt.xticks(fontsize=9)
plt.yticks(fontsize=9)
# labels and title
plt.xlabel('Total_Revenue', fontsize=9)
plt.ylabel('frequency', fontsize=9)
plt.title('Distribution of Revenue', fontsize=18);
#creating sub dataframe where there is an spend indication
df_nonnullrev = df.loc[df['Spend_Ind'] == 1].copy()
# histogram of total revenue distribution when there is spend
ax = sns.distplot(df_nonnullrev.Total_Revenue, hist=True, hist_kws={"edgecolor": 'w', "linewidth": 3}, kde_kws={"linewidth": 3}, bins=[0, 50, 100, 150, 200, 250, 300, 350, 400, 450])
# ticks
plt.xticks(fontsize=9)
plt.yticks(fontsize=9)
# labels and title
plt.xlabel('Total_Revenue', fontsize=9)
plt.ylabel('frequency', fontsize=9)
plt.title('Distribution of Revenue', fontsize=18);
# box plot of the variable Total_Revenue
ax = sns.boxplot(df.Total_Revenue)
# xtick, label, and title
plt.xticks(fontsize=9)
plt.xlabel('Total Revenue', fontsize=9)
plt.title('Distribution of Total Revenue', fontsize=18)
Text(0.5, 1.0, 'Distribution of Total Revenue')
# box plot of the variable Total_Revenue when there is spend
ax = sns.boxplot(df_nonnullrev.Total_Revenue)
# xtick, label, and title
plt.xticks(fontsize=9)
plt.xlabel('Total Revenue', fontsize=9)
plt.title('Distribution of Total Revenue', fontsize=18)
Text(0.5, 1.0, 'Distribution of Total Revenue')
# Boxplot for numerical features and their distrbution
study_set = ['Total_Revenue','FLIGHT_REV','ancll_rev_ttl','BKGS_QTY','AA_OPER_REVNUE_SEG_AMT',
'AA_OPER_REVNUE_SEG_QTY','ACRU_AEM_QTY','ACRU_FLIGHT_AEM_QTY','ACRU_FLIGHT_SEG_QTY',
'CHECK_BAGS_PER_TRIP_AVG','FLOWN_SEG_QTY','TRIP_QTY']
plt.figure(figsize=(20,35))
sns.set_palette(sns.color_palette("Set3", 10))
sns.set_theme(style="dark")
sns.set_palette('Set3')
for i, variable in enumerate(study_set):
plt.subplot(8,5,i+1)
sns.boxplot(y= df[variable], data=df)
plt.tight_layout()
plt.title(variable)
# Barplot for numerical features and their distrbution
study_set = ['Total_Revenue','FLIGHT_REV','ancll_rev_ttl','BKGS_QTY','AA_OPER_REVNUE_SEG_AMT',
'AA_OPER_REVNUE_SEG_QTY','ACRU_AEM_QTY','ACRU_FLIGHT_AEM_QTY','ACRU_FLIGHT_SEG_QTY',
'CHECK_BAGS_PER_TRIP_AVG','FLOWN_SEG_QTY','TRIP_QTY']
plt.figure(figsize=(20,35))
sns.set_palette(sns.color_palette("Set3", 10))
sns.set_theme(style="dark")
sns.set_palette('Set3')
for i, variable in enumerate(study_set):
plt.subplot(8,5,i+1)
sns.distplot(df[variable])
plt.tight_layout()
plt.title(variable)
#creating new dataframe for log analysis on numerical values
df_log = df.copy()
#Adding one to all numerical values for analysing data greater than 0
df_log['log_Total_Revenue'] = np.log(df_log['Total_Revenue']+1)
df_log['log_FLIGHT_REV'] = np.log(df_log['FLIGHT_REV']+1)
df_log['log_ancll_rev_ttl'] = np.log(df_log['ancll_rev_ttl']+1)
df_log['log_BKGS_QTY'] = np.log(df_log['BKGS_QTY']+1)
df_log['log_AA_OPER_REVNUE_SEG_AMT'] = np.log(df_log['AA_OPER_REVNUE_SEG_AMT']+1)
df_log['log_AA_OPER_REVNUE_SEG_QTY'] = np.log(df_log['AA_OPER_REVNUE_SEG_QTY']+1)
df_log['log_ACRU_AEM_QTY'] = np.log(df_log['BKGS_QTY']+1)
df_log['log_ACRU_FLIGHT_AEM_QTY'] = np.log(df_log['AA_OPER_REVNUE_SEG_AMT']+1)
df_log['log_ACRU_FLIGHT_SEG_QTY'] = np.log(df_log['AA_OPER_REVNUE_SEG_QTY']+1)
df_log['log_CHECK_BAGS_PER_TRIP_AVG'] = np.log(df_log['CHECK_BAGS_PER_TRIP_AVG']+1)
df_log['log_FLOWN_SEG_QTY'] = np.log(df_log['FLOWN_SEG_QTY']+1)
df_log['log_TRIP_QTY'] = np.log(df_log['TRIP_QTY']+1)
# Baxplot for log numerical features and their distrbution
log_study_set = ['log_Total_Revenue','log_FLIGHT_REV','log_ancll_rev_ttl','log_BKGS_QTY','log_AA_OPER_REVNUE_SEG_AMT',
'log_AA_OPER_REVNUE_SEG_QTY','log_ACRU_AEM_QTY','log_ACRU_FLIGHT_AEM_QTY','log_ACRU_FLIGHT_SEG_QTY',
'log_CHECK_BAGS_PER_TRIP_AVG','log_FLOWN_SEG_QTY','log_TRIP_QTY']
plt.figure(figsize=(20,35))
sns.set_palette(sns.color_palette("Set3", 10))
sns.set_theme(style="dark")
sns.set_palette('Set3')
for i, variable in enumerate(log_study_set):
plt.subplot(8,5,i+1)
sns.boxplot(y= df_log[variable], data=df_log)
plt.tight_layout()
plt.title(variable)
# lineplot for numerical features and their distrbution
log_study_set = ['log_Total_Revenue','log_FLIGHT_REV','log_ancll_rev_ttl','log_BKGS_QTY','log_AA_OPER_REVNUE_SEG_AMT',
'log_AA_OPER_REVNUE_SEG_QTY','log_ACRU_AEM_QTY','log_ACRU_FLIGHT_AEM_QTY','log_ACRU_FLIGHT_SEG_QTY',
'log_CHECK_BAGS_PER_TRIP_AVG','log_FLOWN_SEG_QTY','log_TRIP_QTY']
plt.figure(figsize=(20,35))
sns.set_palette(sns.color_palette("Set3", 10))
sns.set_theme(style="dark")
sns.set_palette('Set3')
a= np.arange(0, 4, 40, dtype=None)
for i, variable in enumerate(log_study_set):
plt.subplot(8,5,i+1)
sns.distplot(df_log[variable],bins=a)
plt.tight_layout()
plt.title(variable)
# measuring skewness of numeric values
numericColumns = ['Total_Revenue','FLIGHT_REV','ancll_rev_ttl','BKGS_QTY','AA_OPER_REVNUE_SEG_AMT',
'AA_OPER_REVNUE_SEG_QTY','ACRU_AEM_QTY','ACRU_FLIGHT_AEM_QTY','ACRU_FLIGHT_SEG_QTY',
'CHECK_BAGS_PER_TRIP_AVG','FLOWN_SEG_QTY','TRIP_QTY']
skewColumns = df[numericColumns].apply(lambda x: skew(x)).sort_values(ascending=False)
#Listing high skew (greater than 0.5)
highSkew = skewColumns[skewColumns > 0.5]
print(highSkew)
FLIGHT_REV 6.30 Total_Revenue 6.16 CHECK_BAGS_PER_TRIP_AVG 4.97 ACRU_FLIGHT_AEM_QTY 4.16 AA_OPER_REVNUE_SEG_AMT 3.82 AA_OPER_REVNUE_SEG_QTY 3.57 ACRU_FLIGHT_SEG_QTY 3.52 ACRU_AEM_QTY 3.51 BKGS_QTY 3.45 FLOWN_SEG_QTY 3.38 ancll_rev_ttl 3.22 TRIP_QTY 3.18 dtype: float64
# HighSkew index
highSkew.index
Index(['FLIGHT_REV', 'Total_Revenue', 'CHECK_BAGS_PER_TRIP_AVG',
'ACRU_FLIGHT_AEM_QTY', 'AA_OPER_REVNUE_SEG_AMT',
'AA_OPER_REVNUE_SEG_QTY', 'ACRU_FLIGHT_SEG_QTY', 'ACRU_AEM_QTY',
'BKGS_QTY', 'FLOWN_SEG_QTY', 'ancll_rev_ttl', 'TRIP_QTY'],
dtype='object')
# Creating new dataframe for boxcox visualization
df_boxcox = df.copy()
#Creating a boxcox variable for all variables with high skewness
for i in highSkew.index:
df_boxcox[f"boxcox_{i}"] = boxcox1p(df_boxcox[i],boxcox_normmax(df_boxcox[i] + 1))
# show total revenue distrbution
f= plt.figure(figsize=(12,4))
sns.distplot(df['Total_Revenue'],bins=[0,100,200,300,400,500,600,700,800],color='r')
ax.set_title('Distribution of Total Revenue')
Text(0.5, 1.0, 'Distribution of Total Revenue')
# total revenue distrbution on log total revenue
f= plt.figure(figsize=(12,4))
sns.distplot(df_log['log_Total_Revenue'],bins=[0,2,4,6,8,10,12,14,16,18,20],color='r')
ax.set_title('Distribution of log scale Total Revenue')
Text(0.5, 1.0, 'Distribution of log scale Total Revenue')
# distribution of total revenue box cox transformed
f= plt.figure(figsize=(12,4))
sns.distplot(df_boxcox['Total_Revenue'],bins=[0,100,200,300,400,500,600,700],color='r')
ax.set_title('Distribution of Total Revenue boxcox transformed')
Text(0.5, 1.0, 'Distribution of Total Revenue boxcox transformed')
#boxplot on boxcox dataframe for numerical data
boxcox_study_set = ['Total_Revenue','FLIGHT_REV','boxcox_ancll_rev_ttl','boxcox_BKGS_QTY','boxcox_AA_OPER_REVNUE_SEG_AMT',
'boxcox_AA_OPER_REVNUE_SEG_QTY','boxcox_ACRU_AEM_QTY','boxcox_ACRU_FLIGHT_AEM_QTY','boxcox_ACRU_FLIGHT_SEG_QTY',
'boxcox_CHECK_BAGS_PER_TRIP_AVG','boxcox_FLOWN_SEG_QTY','boxcox_TRIP_QTY']
plt.figure(figsize=(20,35))
sns.set_palette(sns.color_palette("Set3", 10))
sns.set_theme(style="dark")
sns.set_palette('Set3')
for i, variable in enumerate(boxcox_study_set):
plt.subplot(8,5,i+1)
sns.boxplot(y= df_boxcox[variable], data=df_boxcox)
plt.tight_layout()
plt.title(variable)
#lineplot on boxcox dataframe for numerical data
boxcox_study_set = ['Total_Revenue','FLIGHT_REV','boxcox_ancll_rev_ttl','boxcox_BKGS_QTY','boxcox_AA_OPER_REVNUE_SEG_AMT',
'boxcox_AA_OPER_REVNUE_SEG_QTY','boxcox_ACRU_AEM_QTY','boxcox_ACRU_FLIGHT_AEM_QTY','boxcox_ACRU_FLIGHT_SEG_QTY',
'boxcox_CHECK_BAGS_PER_TRIP_AVG','boxcox_FLOWN_SEG_QTY','boxcox_TRIP_QTY']
plt.figure(figsize=(20,35))
sns.set_palette(sns.color_palette("Set3", 10))
sns.set_theme(style="dark")
sns.set_palette('Set3')
a= np.arange(0, 4, 100, dtype=None)
for i, variable in enumerate(boxcox_study_set):
plt.subplot(8,5,i+1)
sns.distplot(df_boxcox[variable],bins=a)
plt.tight_layout()
plt.title(variable)
#Histogram plot for total revenue on different bins
plt.hist(df['Total_Revenue'].dropna().values, bins=[0,50,100,150,200,250,300,350,400,450,500,550,600])
plt.show()
# Histogram plot for log total revenue on different bins
plt.hist(df_log['log_Total_Revenue'].dropna().values,bins=[0,1,2,3,4,5,6,7,8,9,10])
plt.show()
# Histogram plot for boxcox total revenue on different bins
plt.hist(df_boxcox['Total_Revenue'].dropna().values,bins=[0,50,100,150,200,250,300,350,400,450,500,550,600])
plt.show()
#df size of row and columns
df.shape[0],df.shape[1]
(1000, 86)
# Creating a dataframe for winsorization
df_win = df.copy()
# Apply one-way winsorization to the highest end of the value distribution. Because of extreme values, set the values of the outliers to the value of the 90th percentile.
df_win['win_Total_Revenue'] = winsorize(df_win['Total_Revenue'], (0, 0.1))
df_win['win_FLIGHT_REV'] = winsorize(df_win['FLIGHT_REV'], (0, 0.1))
df_win['win_ancll_rev_ttl'] = winsorize(df_win['ancll_rev_ttl'], (0, 0.1))
df_win['win_BKGS_QTY'] = winsorize(df_win['BKGS_QTY'], (0, 0.1))
df_win['win_AA_OPER_REVNUE_SEG_AMT'] = winsorize(df_win['AA_OPER_REVNUE_SEG_AMT'], (0, 0.1))
df_win['win_AA_OPER_REVNUE_SEG_QTY'] = winsorize(df_win['AA_OPER_REVNUE_SEG_QTY'], (0, 0.1))
df_win['win_ACRU_AEM_QTY'] = winsorize(df_win['ACRU_AEM_QTY'], (0, 0.1))
df_win['win_ACRU_FLIGHT_AEM_QTY'] = winsorize(df_win['ACRU_FLIGHT_AEM_QTY'], (0, 0.1))
df_win['win_ACRU_FLIGHT_SEG_QTY'] = winsorize(df_win['ACRU_FLIGHT_SEG_QTY'], (0, 0.1))
df_win['win_CHECK_BAGS_PER_TRIP_AVG'] = winsorize(df_win['CHECK_BAGS_PER_TRIP_AVG'], (0, 0.1))
df_win['win_FLOWN_SEG_QTY'] = winsorize(df_win['FLOWN_SEG_QTY'], (0, 0.1))
df_win['win_TRIP_QTY'] = winsorize(df_win['TRIP_QTY'], (0, 0.1))
#Boxplot on winsorized data set
win_study_set = ['win_Total_Revenue','win_FLIGHT_REV','win_ancll_rev_ttl','win_BKGS_QTY','win_AA_OPER_REVNUE_SEG_AMT',
'win_AA_OPER_REVNUE_SEG_QTY','win_ACRU_AEM_QTY','win_ACRU_FLIGHT_AEM_QTY','win_ACRU_FLIGHT_SEG_QTY','win_CHECK_BAGS_PER_TRIP_AVG','win_FLOWN_SEG_QTY','win_TRIP_QTY']
plt.figure(figsize=(20,35))
sns.set_palette(sns.color_palette("Set3", 10))
sns.set_theme(style="dark")
sns.set_palette('Set3')
for i, variable in enumerate(win_study_set):
plt.subplot(8,5,i+1)
sns.boxplot(y= df_win[variable], data=df_win)
plt.tight_layout()
plt.title(variable)
#lineplot on winsorized dataset
win_study_set = ['win_Total_Revenue','win_FLIGHT_REV','win_ancll_rev_ttl','win_BKGS_QTY','win_AA_OPER_REVNUE_SEG_AMT',
'win_AA_OPER_REVNUE_SEG_QTY','win_ACRU_AEM_QTY','win_ACRU_FLIGHT_AEM_QTY','win_ACRU_FLIGHT_SEG_QTY','win_CHECK_BAGS_PER_TRIP_AVG','win_FLOWN_SEG_QTY','win_TRIP_QTY']
plt.figure(figsize=(20,35))
sns.set_palette(sns.color_palette("Set3", 10))
sns.set_theme(style="dark")
sns.set_palette('Set3')
for i, variable in enumerate(win_study_set):
plt.subplot(8,5,i+1)
sns.distplot(df_win[variable])
plt.tight_layout()
plt.title(variable)
# z score on winsorized data and number of outliers
z_scores = zscore(df_win['win_Total_Revenue'])
for threshold in range(2,5):
print("The score threshold is: {}".format(threshold))
print("The indices of the outliers:")
print(np.where(z_scores > threshold))
print("Number of outliers is: {}".format(len((np.where(z_scores > threshold)[0]))))
The score threshold is: 3 The indices of the outliers: (array([], dtype=int64),) Number of outliers is: 0 The score threshold is: 4 The indices of the outliers: (array([], dtype=int64),) Number of outliers is: 0
Create subset dataframes:
df_subset_spend = df[df['Spend_Ind']==1]
df_subset_ancll_spend = df[df['ancillary_spend_ind']==1]
df.groupby(['LYLTY_LEVEL_CD'])['Total_Revenue'].describe().T
| LYLTY_LEVEL_CD | C | E | G | P | R | T |
|---|---|---|---|---|---|---|
| count | 5.00 | 81.00 | 162.00 | 100.00 | 619.00 | 33.00 |
| mean | 356.00 | 275.25 | 205.49 | 233.49 | 163.87 | 189.77 |
| std | 226.44 | 324.97 | 278.23 | 354.59 | 150.86 | 131.29 |
| min | 121.26 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 |
| 25% | 199.83 | 133.98 | 90.05 | 88.14 | 73.63 | 75.22 |
| 50% | 279.84 | 190.13 | 149.31 | 159.39 | 128.40 | 191.63 |
| 75% | 516.28 | 279.10 | 227.97 | 271.38 | 207.46 | 276.43 |
| max | 662.79 | 2299.57 | 2845.80 | 3121.78 | 1300.00 | 497.99 |
df_subset_spend.groupby(['LYLTY_LEVEL_CD'])['Total_Revenue'].describe().T
| LYLTY_LEVEL_CD | C | E | G | P | R | T |
|---|---|---|---|---|---|---|
| count | 5.00 | 77.00 | 147.00 | 93.00 | 560.00 | 30.00 |
| mean | 356.00 | 289.55 | 226.46 | 251.07 | 181.13 | 208.75 |
| std | 226.44 | 327.06 | 283.88 | 361.72 | 148.42 | 122.15 |
| min | 121.26 | 61.40 | 14.88 | 46.85 | 17.67 | 44.24 |
| 25% | 199.83 | 146.77 | 107.38 | 97.67 | 90.41 | 91.54 |
| 50% | 279.84 | 197.07 | 163.59 | 173.02 | 145.35 | 210.69 |
| 75% | 516.28 | 296.00 | 238.65 | 272.56 | 221.19 | 297.36 |
| max | 662.79 | 2299.57 | 2845.80 | 3121.78 | 1300.00 | 497.99 |
df_subset_ancll_spend.groupby(['LYLTY_LEVEL_CD'])['Total_Revenue'].describe().T
| LYLTY_LEVEL_CD | G | R |
|---|---|---|
| count | 16.00 | 159.00 |
| mean | 229.75 | 201.63 |
| std | 88.33 | 135.03 |
| min | 103.10 | 24.38 |
| 25% | 172.08 | 116.16 |
| 50% | 216.28 | 169.07 |
| 75% | 277.28 | 252.69 |
| max | 425.52 | 922.55 |
df.groupby(['LYLTY_LEVEL_CD'])['FLIGHT_REV'].describe().T
| LYLTY_LEVEL_CD | C | E | G | P | R | T |
|---|---|---|---|---|---|---|
| count | 5.00 | 81.00 | 162.00 | 100.00 | 619.00 | 33.00 |
| mean | 356.00 | 275.25 | 200.63 | 233.49 | 153.27 | 189.77 |
| std | 226.44 | 324.97 | 278.09 | 354.59 | 146.66 | 131.29 |
| min | 121.26 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 |
| 25% | 199.83 | 133.98 | 86.75 | 88.14 | 67.59 | 75.22 |
| 50% | 279.84 | 190.13 | 139.97 | 159.39 | 120.95 | 191.63 |
| 75% | 516.28 | 279.10 | 225.13 | 271.38 | 195.01 | 276.43 |
| max | 662.79 | 2299.57 | 2845.80 | 3121.78 | 1300.00 | 497.99 |
df_subset_spend.groupby(['LYLTY_LEVEL_CD'])['FLIGHT_REV'].describe().T
| LYLTY_LEVEL_CD | C | E | G | P | R | T |
|---|---|---|---|---|---|---|
| count | 5.00 | 77.00 | 147.00 | 93.00 | 560.00 | 30.00 |
| mean | 356.00 | 289.55 | 221.10 | 251.07 | 169.42 | 208.75 |
| std | 226.44 | 327.06 | 284.11 | 361.72 | 145.04 | 122.15 |
| min | 121.26 | 61.40 | 14.88 | 46.85 | 0.00 | 44.24 |
| 25% | 199.83 | 146.77 | 105.50 | 97.67 | 80.76 | 91.54 |
| 50% | 279.84 | 197.07 | 149.77 | 173.02 | 131.58 | 210.69 |
| 75% | 516.28 | 296.00 | 230.49 | 272.56 | 209.14 | 297.36 |
| max | 662.79 | 2299.57 | 2845.80 | 3121.78 | 1300.00 | 497.99 |
df_subset_ancll_spend.groupby(['LYLTY_LEVEL_CD'])['FLIGHT_REV'].describe().T
| LYLTY_LEVEL_CD | G | R |
|---|---|---|
| count | 16.00 | 159.00 |
| mean | 180.52 | 160.39 |
| std | 84.48 | 123.30 |
| min | 76.28 | 0.00 |
| 25% | 120.23 | 81.40 |
| 50% | 144.66 | 134.40 |
| 75% | 236.21 | 211.02 |
| max | 373.95 | 815.68 |
df.groupby(['LYLTY_LEVEL_CD'])['ancll_rev_ttl'].describe().T
| LYLTY_LEVEL_CD | C | E | G | P | R | T |
|---|---|---|---|---|---|---|
| count | 5.00 | 81.00 | 162.00 | 100.00 | 619.00 | 33.00 |
| mean | 0.00 | 0.00 | 4.86 | 0.00 | 10.59 | 0.00 |
| std | 0.00 | 0.00 | 16.07 | 0.00 | 22.15 | 0.00 |
| min | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 |
| 25% | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 |
| 50% | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 |
| 75% | 0.00 | 0.00 | 0.00 | 0.00 | 10.38 | 0.00 |
| max | 0.00 | 0.00 | 109.30 | 0.00 | 125.00 | 0.00 |
df_subset_spend.groupby(['LYLTY_LEVEL_CD'])['ancll_rev_ttl'].describe().T
| LYLTY_LEVEL_CD | C | E | G | P | R | T |
|---|---|---|---|---|---|---|
| count | 5.00 | 77.00 | 147.00 | 93.00 | 560.00 | 30.00 |
| mean | 0.00 | 0.00 | 5.36 | 0.00 | 11.71 | 0.00 |
| std | 0.00 | 0.00 | 16.80 | 0.00 | 23.00 | 0.00 |
| min | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 |
| 25% | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 |
| 50% | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 |
| 75% | 0.00 | 0.00 | 0.00 | 0.00 | 24.54 | 0.00 |
| max | 0.00 | 0.00 | 109.30 | 0.00 | 125.00 | 0.00 |
df_subset_ancll_spend.groupby(['LYLTY_LEVEL_CD'])['ancll_rev_ttl'].describe().T
| LYLTY_LEVEL_CD | G | R |
|---|---|---|
| count | 16.00 | 159.00 |
| mean | 49.23 | 41.24 |
| std | 21.02 | 25.43 |
| min | 26.82 | 6.63 |
| 25% | 34.75 | 30.00 |
| 50% | 40.00 | 30.00 |
| 75% | 59.93 | 42.47 |
| max | 109.30 | 125.00 |
df.groupby(['age_cat'])['INDVDL_AGE_QTY'].describe().T
| age_cat | Z | Millennial | X | Boomers | Silent |
|---|---|---|---|---|---|
| count | 36.00 | 249.00 | 331.00 | 342.00 | 42.00 |
| mean | 16.94 | 32.00 | 47.46 | 62.11 | 77.81 |
| std | 4.93 | 4.38 | 4.52 | 4.98 | 4.01 |
| min | 5.00 | 23.00 | 39.00 | 55.00 | 74.00 |
| 25% | 16.00 | 28.00 | 44.00 | 58.00 | 75.00 |
| 50% | 19.00 | 32.00 | 48.00 | 62.00 | 77.00 |
| 75% | 20.25 | 36.00 | 52.00 | 66.00 | 78.00 |
| max | 22.00 | 38.00 | 54.00 | 73.00 | 91.00 |
df_subset_spend.groupby(['age_cat'])['INDVDL_AGE_QTY'].describe().T
| age_cat | Z | Millennial | X | Boomers | Silent |
|---|---|---|---|---|---|
| count | 31.00 | 232.00 | 311.00 | 304.00 | 34.00 |
| mean | 16.52 | 32.02 | 47.46 | 61.96 | 78.15 |
| std | 5.11 | 4.34 | 4.54 | 4.97 | 4.35 |
| min | 5.00 | 23.00 | 39.00 | 55.00 | 74.00 |
| 25% | 14.50 | 28.00 | 44.00 | 58.00 | 75.00 |
| 50% | 19.00 | 32.00 | 48.00 | 61.00 | 77.00 |
| 75% | 20.00 | 36.00 | 52.00 | 66.00 | 79.00 |
| max | 22.00 | 38.00 | 54.00 | 73.00 | 91.00 |
df_subset_ancll_spend.groupby(['age_cat'])['INDVDL_AGE_QTY'].describe().T
| age_cat | Z | Millennial | X | Boomers | Silent |
|---|---|---|---|---|---|
| count | 7.00 | 45.00 | 57.00 | 56.00 | 10.00 |
| mean | 16.00 | 31.80 | 47.82 | 62.25 | 81.00 |
| std | 6.68 | 3.97 | 4.52 | 5.25 | 5.25 |
| min | 5.00 | 24.00 | 39.00 | 55.00 | 74.00 |
| 25% | 12.50 | 29.00 | 45.00 | 58.00 | 78.25 |
| 50% | 20.00 | 32.00 | 48.00 | 62.00 | 79.50 |
| 75% | 20.50 | 35.00 | 52.00 | 66.00 | 81.75 |
| max | 21.00 | 38.00 | 54.00 | 73.00 | 91.00 |
df.groupby(['age_cat'])['Total_Revenue'].describe().T
| age_cat | Z | Millennial | X | Boomers | Silent |
|---|---|---|---|---|---|
| count | 36.00 | 249.00 | 331.00 | 342.00 | 42.00 |
| mean | 142.77 | 170.25 | 217.48 | 178.85 | 183.93 |
| std | 116.71 | 159.77 | 294.59 | 183.85 | 227.24 |
| min | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 |
| 25% | 68.23 | 73.37 | 99.45 | 79.95 | 57.01 |
| 50% | 138.20 | 130.50 | 163.59 | 136.14 | 121.12 |
| 75% | 174.18 | 208.37 | 244.50 | 227.28 | 239.73 |
| max | 532.51 | 1226.28 | 3121.78 | 1534.95 | 1300.00 |
df_subset_spend.groupby(['age_cat'])['Total_Revenue'].describe().T
| age_cat | Z | Millennial | X | Boomers | Silent |
|---|---|---|---|---|---|
| count | 31.00 | 232.00 | 311.00 | 304.00 | 34.00 |
| mean | 165.80 | 182.72 | 231.47 | 201.21 | 227.21 |
| std | 109.29 | 158.47 | 298.55 | 183.10 | 232.44 |
| min | 25.00 | 14.88 | 23.48 | 17.67 | 32.00 |
| 25% | 100.40 | 85.03 | 108.83 | 94.62 | 80.13 |
| 50% | 153.32 | 149.24 | 169.52 | 148.75 | 171.76 |
| 75% | 181.52 | 225.36 | 254.00 | 238.70 | 295.87 |
| max | 532.51 | 1226.28 | 3121.78 | 1534.95 | 1300.00 |
df_subset_ancll_spend.groupby(['age_cat'])['Total_Revenue'].describe().T
| age_cat | Z | Millennial | X | Boomers | Silent |
|---|---|---|---|---|---|
| count | 7.00 | 45.00 | 57.00 | 56.00 | 10.00 |
| mean | 126.09 | 183.66 | 224.42 | 210.34 | 201.69 |
| std | 65.23 | 106.06 | 153.28 | 134.88 | 97.32 |
| min | 25.00 | 30.00 | 30.00 | 24.38 | 74.68 |
| 25% | 95.90 | 111.86 | 137.88 | 121.77 | 117.69 |
| 50% | 112.79 | 168.21 | 176.98 | 175.59 | 195.46 |
| 75% | 164.85 | 199.30 | 270.27 | 275.35 | 295.87 |
| max | 223.33 | 565.49 | 922.55 | 742.47 | 326.92 |
df.groupby(['age_cat'])['FLIGHT_REV'].describe().T
| age_cat | Z | Millennial | X | Boomers | Silent |
|---|---|---|---|---|---|
| count | 36.00 | 249.00 | 331.00 | 342.00 | 42.00 |
| mean | 136.28 | 163.88 | 210.37 | 171.26 | 170.15 |
| std | 117.79 | 159.36 | 293.57 | 181.72 | 228.00 |
| min | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 |
| 25% | 62.80 | 68.12 | 92.45 | 73.07 | 45.58 |
| 50% | 129.37 | 126.51 | 152.80 | 127.43 | 90.50 |
| 75% | 167.97 | 199.83 | 234.22 | 216.62 | 222.13 |
| max | 532.51 | 1226.28 | 3121.78 | 1534.95 | 1300.00 |
df_subset_spend.groupby(['age_cat'])['FLIGHT_REV'].describe().T
| age_cat | Z | Millennial | X | Boomers | Silent |
|---|---|---|---|---|---|
| count | 31.00 | 232.00 | 311.00 | 304.00 | 34.00 |
| mean | 158.26 | 175.89 | 223.90 | 192.67 | 210.18 |
| std | 112.22 | 158.57 | 297.84 | 181.73 | 236.47 |
| min | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 |
| 25% | 79.90 | 77.15 | 107.38 | 90.45 | 74.92 |
| 50% | 153.32 | 135.57 | 161.64 | 143.72 | 157.20 |
| 75% | 171.30 | 214.65 | 244.50 | 228.61 | 240.93 |
| max | 532.51 | 1226.28 | 3121.78 | 1534.95 | 1300.00 |
df_subset_ancll_spend.groupby(['age_cat'])['FLIGHT_REV'].describe().T
| age_cat | Z | Millennial | X | Boomers | Silent |
|---|---|---|---|---|---|
| count | 7.00 | 45.00 | 57.00 | 56.00 | 10.00 |
| mean | 92.70 | 148.44 | 183.13 | 164.01 | 143.80 |
| std | 58.27 | 102.23 | 138.31 | 120.48 | 102.43 |
| min | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 |
| 25% | 69.30 | 74.94 | 107.88 | 82.80 | 57.27 |
| 50% | 82.79 | 134.40 | 147.91 | 134.56 | 138.09 |
| 75% | 128.97 | 169.30 | 233.10 | 219.76 | 225.73 |
| max | 169.57 | 480.00 | 815.68 | 624.91 | 296.92 |
df.groupby(['age_cat'])['ancll_rev_ttl'].describe().T
| age_cat | Z | Millennial | X | Boomers | Silent |
|---|---|---|---|---|---|
| count | 36.00 | 249.00 | 331.00 | 342.00 | 42.00 |
| mean | 6.49 | 6.36 | 7.11 | 7.59 | 13.78 |
| std | 14.39 | 16.03 | 18.76 | 20.45 | 28.09 |
| min | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 |
| 25% | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 |
| 50% | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 |
| 75% | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 |
| max | 53.76 | 100.00 | 125.00 | 117.56 | 100.00 |
df_subset_spend.groupby(['age_cat'])['ancll_rev_ttl'].describe().T
| age_cat | Z | Millennial | X | Boomers | Silent |
|---|---|---|---|---|---|
| count | 31.00 | 232.00 | 311.00 | 304.00 | 34.00 |
| mean | 7.54 | 6.83 | 7.57 | 8.53 | 17.03 |
| std | 15.27 | 16.51 | 19.27 | 21.50 | 30.38 |
| min | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 |
| 25% | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 |
| 50% | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 |
| 75% | 0.00 | 0.00 | 0.00 | 0.00 | 30.00 |
| max | 53.76 | 100.00 | 125.00 | 117.56 | 100.00 |
df_subset_ancll_spend.groupby(['age_cat'])['ancll_rev_ttl'].describe().T
| age_cat | Z | Millennial | X | Boomers | Silent |
|---|---|---|---|---|---|
| count | 7.00 | 45.00 | 57.00 | 56.00 | 10.00 |
| mean | 33.39 | 35.22 | 41.29 | 46.33 | 57.89 |
| std | 12.63 | 20.22 | 25.25 | 27.65 | 27.51 |
| min | 18.91 | 8.11 | 11.08 | 6.63 | 30.00 |
| 25% | 25.00 | 30.00 | 30.00 | 30.00 | 31.09 |
| 50% | 30.00 | 30.00 | 30.20 | 30.00 | 56.94 |
| 75% | 40.53 | 30.00 | 40.00 | 65.23 | 80.21 |
| max | 53.76 | 100.00 | 125.00 | 117.56 | 100.00 |
#Barplot frequency on each features
categories= ['GENDER_CD','age_cat','aa_value_code','SEG_INTRVL_TYPE','CABIN_BOOKED','CABIN_FLOWN','BX_IND','BKG_BUSINES_LEISR_IND','LYLTY_LEVEL_CD','MACRO_SEG_ID',
'BARCLAYS_CARD_HOLDER_IND','CITIBNK_PREMIM_CARD_HLDR_IND','LOUNGE_MBR_PRTCPNT_IND','day_of_week_dep','dep_month','upgrade_ind','ACCT_FARE_CLASS_TYPE','TKT_TYPE','ancillary_spend_ind','DEP_CNTRY_CD','ARVL_CNTRY_CD','UPG_TYPE']
title=['GENDER_CD','age_cat','aa_value_code','SEG_INTRVL_TYPE','CABIN_BOOKED','CABIN_FLOWN','BX_IND','BKG_BUSINES_LEISR_IND','LYLTY_LEVEL_CD','MACRO_SEG_ID',
'BARCLAYS_CARD_HOLDER_IND','CITIBNK_PREMIM_CARD_HLDR_IND','LOUNGE_MBR_PRTCPNT_IND','day_of_week_dep','dep_month','upgrade_ind','ACCT_FARE_CLASS_TYPE','TKT_TYPE','ancillary_spend_ind','DEP_CNTRY_CD','ARVL_CNTRY_CD','UPG_TYPE']
plt.figure(figsize=(30 ,50))
sns.set(font_scale = 3)
sns.set_theme(style="dark")
for i, variable in enumerate(categories):
plt.subplot(9,3,i+1)
sns.set_palette('coolwarm')
ax=sns.countplot(x=df[variable], data=df, order = df[variable].value_counts().index)
for p in ax.patches:
percentage = '{:.1f}%'.format(100 * p.get_height()/len(df[variable]))
x = p.get_x() + p.get_width()
y = p.get_y() + p.get_height()
plt.annotate(percentage, (x, y),ha='center', fontsize = 16)
plt.tight_layout()
plt.title(title[i],fontsize = 16)
# Percentage of 'SEG_DEP_AIRPRT_IATA_CD' over 'SEG_DEP_AIRPRT_IATA_CD'
df['SEG_DEP_AIRPRT_IATA_CD'].value_counts().nlargest(10)/df['SEG_DEP_AIRPRT_IATA_CD'].count()*100
DFW 16.20 CLT 9.90 ORD 7.30 MIA 4.80 PHL 4.70 PHX 4.10 DCA 3.70 LAX 3.10 LGA 1.60 BOS 1.60 Name: SEG_DEP_AIRPRT_IATA_CD, dtype: float64
# Grouping 'SEG_DEP_AIRPRT_IATA_CD' and 'SEG_DEP_AIRPRT_IATA_CD' with their counts
df.groupby(['SEG_DEP_AIRPRT_IATA_CD']).SEG_DEP_AIRPRT_IATA_CD.value_counts().nlargest(10)
SEG_DEP_AIRPRT_IATA_CD SEG_DEP_AIRPRT_IATA_CD DFW DFW 162 CLT CLT 99 ORD ORD 73 MIA MIA 48 PHL PHL 47 PHX PHX 41 DCA DCA 37 LAX LAX 31 BOS BOS 16 LGA LGA 16 Name: SEG_DEP_AIRPRT_IATA_CD, dtype: int64
# Percentage of 'SEG_DEP_AIRPRT_IATA_CD' over 'SEG_DEP_AIRPRT_IATA_CD' where there is ancll spend
df_subset_ancll_spend['SEG_DEP_AIRPRT_IATA_CD'].value_counts().nlargest(10)/df_subset_ancll_spend['SEG_DEP_AIRPRT_IATA_CD'].count()*100
DFW 13.71 ORD 5.71 MIA 5.14 PHX 5.14 DCA 4.57 CLT 4.57 EWR 2.86 LAX 2.86 DEN 2.29 PHL 2.29 Name: SEG_DEP_AIRPRT_IATA_CD, dtype: float64
# Percentage of 'SEG_ARVL_AIRPRT_IATA_CD' over 'SEG_DEP_AIRPRT_IATA_CD' where there is ancll spend
df['SEG_ARVL_AIRPRT_IATA_CD'].value_counts().nlargest(10)/df['SEG_DEP_AIRPRT_IATA_CD'].count()*100
DFW 16.60 CLT 11.20 ORD 6.90 MIA 5.40 PHX 5.20 PHL 5.20 LAX 4.70 DCA 2.80 LGA 1.80 BOS 1.40 Name: SEG_ARVL_AIRPRT_IATA_CD, dtype: float64
# Grouping 'SEG_ARVL_AIRPRT_IATA_CD' and 'SEG_ARVL_AIRPRT_IATA_CD' with their counts
df.groupby(['SEG_ARVL_AIRPRT_IATA_CD']).SEG_ARVL_AIRPRT_IATA_CD.value_counts().nlargest(10)
SEG_ARVL_AIRPRT_IATA_CD SEG_ARVL_AIRPRT_IATA_CD DFW DFW 166 CLT CLT 112 ORD ORD 69 MIA MIA 54 PHL PHL 52 PHX PHX 52 LAX LAX 47 DCA DCA 28 LGA LGA 18 BOS BOS 14 Name: SEG_ARVL_AIRPRT_IATA_CD, dtype: int64
# Percentage of 'MKT_N_DIRECTN_AIRPRT_PAIR_CD' over 'MKT_N_DIRECTN_AIRPRT_PAIR_CD' where there is ancll spend
df['MKT_N_DIRECTN_AIRPRT_PAIR_CD'].value_counts().nlargest(10)/df['MKT_N_DIRECTN_AIRPRT_PAIR_CD'].count()*100
ORDPHL 1.10 DFWORD 1.10 DCADFW 1.00 DFWLAX 0.90 DFWLGA 0.90 LAXORD 0.90 DFWSAN 0.90 CLTDFW 0.80 PHLPHX 0.80 DFWLAS 0.80 Name: MKT_N_DIRECTN_AIRPRT_PAIR_CD, dtype: float64
# Percentage of 'MKT_N_DIRECTN_AIRPRT_PAIR_CD' over 'MKT_N_DIRECTN_AIRPRT_PAIR_CD' where there is ancll spend
df_subset_ancll_spend['MKT_N_DIRECTN_AIRPRT_PAIR_CD'].value_counts().nlargest(10)/df_subset_ancll_spend['MKT_N_DIRECTN_AIRPRT_PAIR_CD'].count()*100
DFWLAX 1.71 BNADFW 1.71 DFWSNA 1.71 DCADFW 1.14 CLTMIA 1.14 CLTDFW 1.14 DSMPHX 1.14 DFWORD 1.14 DFWMCI 1.14 DENDFW 1.14 Name: MKT_N_DIRECTN_AIRPRT_PAIR_CD, dtype: float64
# Grouping 'MKT_N_DIRECTN_AIRPRT_PAIR_CD' and 'MKT_N_DIRECTN_AIRPRT_PAIR_CD' with their counts
df.groupby(['MKT_N_DIRECTN_AIRPRT_PAIR_CD']).MKT_N_DIRECTN_AIRPRT_PAIR_CD.value_counts().nlargest(10)
MKT_N_DIRECTN_AIRPRT_PAIR_CD MKT_N_DIRECTN_AIRPRT_PAIR_CD DFWORD DFWORD 11 ORDPHL ORDPHL 11 DCADFW DCADFW 10 DFWLAX DFWLAX 9 DFWLGA DFWLGA 9 DFWSAN DFWSAN 9 LAXORD LAXORD 9 CLTDFW CLTDFW 8 DFWLAS DFWLAS 8 PHLPHX PHLPHX 8 Name: MKT_N_DIRECTN_AIRPRT_PAIR_CD, dtype: int64
#Count of 'DEP_CNTRY_CD' over 'DEP_CNTRY_CD'
df['DEP_CNTRY_CD'].value_counts()/df['DEP_CNTRY_CD'].count()
US 0.93 Other 0.05 MX 0.01 CA 0.01 GB 0.00 Name: DEP_CNTRY_CD, dtype: float64
#Count of 'DEP_CNTRY_CD' over 'DEP_CNTRY_CD' where there is ancll spend
df_subset_ancll_spend['DEP_CNTRY_CD'].value_counts()/df_subset_ancll_spend['DEP_CNTRY_CD'].count()
US 0.91 Other 0.06 MX 0.02 CA 0.01 GB 0.01 Name: DEP_CNTRY_CD, dtype: float64
#Count of 'ARVL_CNTRY_CD' over 'ARVL_CNTRY_CD'
df['ARVL_CNTRY_CD'].value_counts()/df['ARVL_CNTRY_CD'].count()
US 0.94 Other 0.04 GB 0.01 MX 0.01 CA 0.00 Name: ARVL_CNTRY_CD, dtype: float64
#Count of 'TACTL_ENTITY' over 'TACTL_ENTITY'
df['TACTL_ENTITY'].value_counts()/df['TACTL_ENTITY'].count()
S-SOUTH 0.08 MA-VA/WV/PA/OH 0.07 NE-H2H 0.07 NWC-TEXAS 0.06 MA-H2H 0.05 SW-S CALIFORNIA 0.04 S-FLORIDA 0.04 MA-CAROLINAS 0.04 MW-MIDWEST SOUTH 0.04 MW-MIDWEST NORTH 0.04 SW-N CALIFORNIA 0.04 SW-SW SMALL CITIES 0.04 NWC-CENTRAL/MOUNTAIN 0.03 NE-NEW YORK 0.03 MIA-H2H 0.03 MIA-HUB 0.03 SW-LAS 0.03 MW-H2H 0.03 CRB - PLS 0.02 NW/WEST CANADA 0.02 NE-NORTHEAST SPOKES 0.02 S-ORLANDO 0.01 C.EUROPE 0.01 UK 0.01 S-H2H 0.01 SW-HAWAII 0.01 C.AMERICA 0.01 MEX-PLE 0.01 MEX-BUS 0.01 NWC-CABIN/SKI 0.01 SW-H2H 0.01 CRB - VFR 0.01 NE-EAST CANADA 0.01 CRB - PR/VI 0.01 MA-DC AREA 0.01 PACIFIC 0.01 S.AMERICA 0.01 N.RIM 0.01 SPAIN 0.01 BRAZIL 0.00 AUSTRALIA/NZ 0.00 Name: TACTL_ENTITY, dtype: float64
# Grouping 'MKT_N_DIRECTN_AIRPRT_PAIR_CD' and 'MKT_N_DIRECTN_AIRPRT_PAIR_CD' with their counts
df.groupby(['TACTL_ENTITY']).TACTL_ENTITY.value_counts().nlargest(10)
TACTL_ENTITY TACTL_ENTITY S-SOUTH S-SOUTH 80 MA-VA/WV/PA/OH MA-VA/WV/PA/OH 68 NE-H2H NE-H2H 67 NWC-TEXAS NWC-TEXAS 55 MA-H2H MA-H2H 49 S-FLORIDA S-FLORIDA 45 SW-S CALIFORNIA SW-S CALIFORNIA 45 MA-CAROLINAS MA-CAROLINAS 44 MW-MIDWEST NORTH MW-MIDWEST NORTH 38 MW-MIDWEST SOUTH MW-MIDWEST SOUTH 38 Name: TACTL_ENTITY, dtype: int64
x=df.groupby(['Spend_Ind'])['Spend_Ind'].count()
y=len(df)
percentage=((x/y)).round(2)
ratio = pd.DataFrame(percentage).T
# Spend distrbution
fig, ax = plt.subplots(1,1,figsize=(6, 3))
ax.barh(ratio.index, ratio[1], color='#1f77b4', alpha=1, label='Spend')
ax.barh(ratio.index, ratio[0], left=ratio[1], color='#d62728', alpha=1, label='No Spend')
ax.set_xlim(0, 1)
ax.set_xticks([])
ax.set_yticks([])
for i in ratio.index:
ax.annotate(f"{int(ratio[1][i]*100)}%", xy=(ratio[1][i]/2, i),va = 'center', ha='center',fontsize=20, fontweight='light', fontfamily='serif',color='black')
ax.annotate("Spend", xy=(ratio[1][i]/2, -0.25),va = 'center', ha='center',fontsize=20, fontweight='light', fontfamily='serif',color='black')
for i in ratio.index:
ax.annotate(f"{int(ratio[0][i]*100)}%",xy=(ratio[1][i]+ratio[0][i]/2, i),va = 'center', ha='center',fontsize=20, fontweight='light', fontfamily='serif',color='black')
ax.annotate("No Spend", xy=(ratio[1][i]+ratio[0][i]/2, -0.25),va = 'center', ha='center',fontsize=20, fontweight='light', fontfamily='serif',color='black')
fig.text(0.125,1.00,'Spend Distribution', fontfamily='serif',fontsize=15, fontweight='bold')
fig.text(0.125,.90,'92% of all transactions had spend greater than $0',fontfamily='serif',fontsize=12)
plt.show()
x=df_subset_spend.groupby(['ancillary_spend_ind'])['ancillary_spend_ind'].count()
y=len(df_subset_spend)
percentage=((x/y)).round(2)
ratio = pd.DataFrame(percentage).T
# Ancillary spend distrbution
fig, ax = plt.subplots(1,1,figsize=(6, 3))
ax.barh(ratio.index, ratio[1], color='#1f77b4', alpha=1, label='Spend')
ax.barh(ratio.index, ratio[0], left=ratio[1], color='#d62728', alpha=1, label='No Spend')
ax.set_xlim(0, 1)
ax.set_xticks([])
ax.set_yticks([])
for i in ratio.index:
ax.annotate(f"{int(ratio[1][i]*100)}%", xy=(ratio[1][i]/2, i),va = 'center', ha='center',fontsize=14, fontweight='light', fontfamily='serif',color='black')
ax.annotate("Ancillary Spend", xy=(ratio[1][i]/2, -0.25),va = 'center', ha='center',fontsize=14, fontweight='light', fontfamily='serif',color='black')
for i in ratio.index:
ax.annotate(f"{int(ratio[0][i]*100)}%",xy=(ratio[1][i]+ratio[0][i]/2, i),va = 'center', ha='center',fontsize=14, fontweight='light', fontfamily='serif',color='black')
ax.annotate("No Ancillary Spend", xy=(ratio[1][i]+ratio[0][i]/2, -0.25),va = 'center', ha='center',fontsize=14, fontweight='light', fontfamily='serif',color='black')
fig.text(0.125,1.00,'Ancillary Spend Distribution', fontfamily='serif',fontsize=15, fontweight='bold')
fig.text(0.125,.90,'Only 20% of all transactions with spend had ancillary charges',fontfamily='serif',fontsize=12)
plt.show()
flight_rev_ancl=df.groupby(['ancillary_spend_ind'])['FLIGHT_REV'].mean()
print(flight_rev_ancl)
ancillary_spend_ind 0 185.06 1 162.23 Name: FLIGHT_REV, dtype: float64
flight_rev_spend=df.groupby(['Spend_Ind'])['FLIGHT_REV'].mean()
print(flight_rev_spend)
Spend_Ind 0 0.00 1 198.54 Name: FLIGHT_REV, dtype: float64
total_rev_spend=df.groupby(['Spend_Ind'])['Total_Revenue'].mean()
print(total_rev_spend)
Spend_Ind 0 0.00 1 206.59 Name: Total_Revenue, dtype: float64
ancl_rev_spend=df.groupby(['Spend_Ind'])['ancll_rev_ttl'].mean()
print(ancl_rev_spend)
Spend_Ind 0 0.00 1 8.05 Name: ancll_rev_ttl, dtype: float64
total_rev=df.groupby(['ancillary_spend_ind','Spend_Ind'])['Total_Revenue'].mean()
print(total_rev)
ancillary_spend_ind Spend_Ind
0 0 0.00
1 207.16
1 1 204.20
Name: Total_Revenue, dtype: float64
flight_rev_ancl=df.groupby(['ancillary_spend_ind'])['FLIGHT_REV'].mean()
print(flight_rev_ancl)
ancillary_spend_ind 0 185.06 1 162.23 Name: FLIGHT_REV, dtype: float64
ancl_rev_ancl=df.groupby(['ancillary_spend_ind'])['ancll_rev_ttl'].mean()
print(ancl_rev_ancl)
ancillary_spend_ind 0 0.00 1 41.97 Name: ancll_rev_ttl, dtype: float64
grouped_rev_ind = df.groupby(['Spend_Ind', 'ancillary_spend_ind']).agg({'Total_Revenue': ['mean','count'],'ancll_rev_ttl': ['mean']})
grouped_rev_ind.columns = ['revenue mean','revenue count','ancl mean']
grouped_rev_ind = grouped_rev_ind.reset_index()
print(grouped_rev_ind)
Spend_Ind ancillary_spend_ind revenue mean revenue count ancl mean 0 0 0 0.00 88 0.00 1 1 0 207.16 737 0.00 2 1 1 204.20 175 41.97
grouped_value = df.groupby(['Spend_Ind', 'ancillary_spend_ind','aa_value_code']).agg({'Total_Revenue': ['mean','count'],'FLIGHT_REV': ['mean'],'ancll_rev_ttl': ['mean']})
grouped_value.columns = ['revenue mean','revenue count','flight mean','ancillary mean']
grouped_value = grouped_value.reset_index()
grouped_value
| Spend_Ind | ancillary_spend_ind | aa_value_code | revenue mean | revenue count | flight mean | ancillary mean | |
|---|---|---|---|---|---|---|---|
| 0 | 0 | 0 | High Value | 0.00 | 10 | 0.00 | 0.00 |
| 1 | 0 | 0 | Medium Value | 0.00 | 47 | 0.00 | 0.00 |
| 2 | 0 | 0 | Value | 0.00 | 31 | 0.00 | 0.00 |
| 3 | 1 | 0 | High Value | 282.83 | 192 | 282.83 | 0.00 |
| 4 | 1 | 0 | Medium Value | 178.22 | 297 | 178.22 | 0.00 |
| 5 | 1 | 0 | Value | 183.23 | 248 | 183.23 | 0.00 |
| 6 | 1 | 1 | High Value | 207.03 | 12 | 174.10 | 32.93 |
| 7 | 1 | 1 | Medium Value | 210.43 | 81 | 169.95 | 40.47 |
| 8 | 1 | 1 | Value | 197.64 | 82 | 152.86 | 44.78 |
#Total revenue, flight Revenue and ancillary revenue by customer value
# df_subset_spenda = df[df['ancillary_spend_ind']==1]
sns.set_theme(style="dark")
sns.set_palette('coolwarm')
plt.figure(figsize=(18,4))
plt.subplot(1,3,1)
ax = sns.barplot(x="Total_Revenue", y='aa_value_code', hue="ancillary_spend_ind", data=df_subset_spend, ci=None, order=df_subset_spend.aa_value_code.value_counts().iloc[:4].index).set(title='Total Revenue by Customer Value', xlabel='Total Revenue', ylabel='Value Type')
plt.legend(bbox_to_anchor=(1.05, 1), loc=5, borderaxespad=0.)
# ##plt.xlim(0, 200)
plt.subplot(1,3,2)
ax = sns.barplot(x="FLIGHT_REV", y='aa_value_code', hue="ancillary_spend_ind", data=df_subset_spend, ci=None, order=df_subset_spend.aa_value_code.value_counts().iloc[:4].index).set(title='Flight Revenue by Customer Value', xlabel='Flight Revenue', ylabel='Value Type')
plt.legend(bbox_to_anchor=(1.05, 1), loc=5, borderaxespad=0.)
plt.subplot(1,3,3)
ax = sns.barplot(x="ancll_rev_ttl", y='aa_value_code', hue="ancillary_spend_ind", data=df_subset_spend, ci=None, order=df_subset_spend.aa_value_code.value_counts().iloc[:4].index).set(title='Ancillary Revenue by Customer Value', xlabel='Ancillary Revenue', ylabel='Value Type')
plt.legend(bbox_to_anchor=(1.05, 1), loc=5, borderaxespad=0.)
<matplotlib.legend.Legend at 0x7f17ecf79f28>
grouped_loyalty = df.groupby(['Spend_Ind', 'ancillary_spend_ind','LYLTY_LEVEL_CD']).agg({'Total_Revenue': ['mean','count'],'FLIGHT_REV': ['mean'],'ancll_rev_ttl': ['mean']})
grouped_loyalty.columns = ['ttl mean','ttl count','flight mean','ancllry mean']
grouped_loyalty = grouped_loyalty.reset_index()
grouped_loyalty
| Spend_Ind | ancillary_spend_ind | LYLTY_LEVEL_CD | ttl mean | ttl count | flight mean | ancllry mean | |
|---|---|---|---|---|---|---|---|
| 0 | 0 | 0 | E | 0.00 | 4 | 0.00 | 0.00 |
| 1 | 0 | 0 | G | 0.00 | 15 | 0.00 | 0.00 |
| 2 | 0 | 0 | P | 0.00 | 7 | 0.00 | 0.00 |
| 3 | 0 | 0 | R | 0.00 | 59 | 0.00 | 0.00 |
| 4 | 0 | 0 | T | 0.00 | 3 | 0.00 | 0.00 |
| 5 | 1 | 0 | C | 356.00 | 5 | 356.00 | 0.00 |
| 6 | 1 | 0 | E | 289.55 | 77 | 289.55 | 0.00 |
| 7 | 1 | 0 | G | 226.06 | 131 | 226.06 | 0.00 |
| 8 | 1 | 0 | P | 251.07 | 93 | 251.07 | 0.00 |
| 9 | 1 | 0 | R | 173.00 | 401 | 173.00 | 0.00 |
| 10 | 1 | 0 | T | 208.75 | 30 | 208.75 | 0.00 |
| 11 | 1 | 1 | G | 229.75 | 16 | 180.52 | 49.23 |
| 12 | 1 | 1 | R | 201.63 | 159 | 160.39 | 41.24 |
#Total revenue, flight Revenue and ancillary revenue by loyalty level
sns.catplot(y='Total_Revenue',x='LYLTY_LEVEL_CD',hue='ancillary_spend_ind',kind='bar', data=df_subset_spend,order=df_subset_spend.LYLTY_LEVEL_CD.value_counts().iloc[:7].index).set(title='Total Revenue by Loyalty Level',
xlabel='Loyalty Level', ylabel='Total Revenue')
plt.ylim(0, 1000)
sns.catplot(y='FLIGHT_REV',x='LYLTY_LEVEL_CD',hue='ancillary_spend_ind',kind='bar', data=df_subset_spend, order=df_subset_spend.LYLTY_LEVEL_CD.value_counts().iloc[:7].index).set(title='Flight Revenue by Loyalty Level',
xlabel='Loyalty Level', ylabel='Flight Revenue')
plt.ylim(0, 1000)
sns.catplot(y='ancll_rev_ttl',x='LYLTY_LEVEL_CD',hue='ancillary_spend_ind',kind='bar', data=df_subset_spend, order=df_subset_spend.LYLTY_LEVEL_CD.value_counts().iloc[:7].index).set(title='Ancillary Revenue by Loyalty Level',
xlabel='Loyalty Level', ylabel='Ancillary Revenue')
plt.ylim(0, 200)
(0.0, 200.0)
grouped_age = df.groupby(['Spend_Ind', 'ancillary_spend_ind','age_cat','GENDER_CD']).agg({'Total_Revenue': ['mean','count'],'FLIGHT_REV': ['mean'],'ancll_rev_ttl': ['mean']})
grouped_age.columns = ['ttl mean','ttl count','flight mean','ancllry mean']
grouped_age = grouped_age.reset_index()
grouped_age
| Spend_Ind | ancillary_spend_ind | age_cat | GENDER_CD | ttl mean | ttl count | flight mean | ancllry mean | |
|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 0 | Boomers | F | 0.00 | 18.00 | 0.00 | 0.00 |
| 1 | 0 | 0 | Boomers | M | 0.00 | 20.00 | 0.00 | 0.00 |
| 2 | 0 | 0 | Boomers | U | nan | nan | nan | nan |
| 3 | 0 | 0 | Boomers | X | nan | nan | nan | nan |
| 4 | 0 | 0 | Millennial | F | 0.00 | 8.00 | 0.00 | 0.00 |
| 5 | 0 | 0 | Millennial | M | 0.00 | 9.00 | 0.00 | 0.00 |
| 6 | 0 | 0 | Millennial | U | nan | nan | nan | nan |
| 7 | 0 | 0 | Millennial | X | nan | nan | nan | nan |
| 8 | 0 | 0 | Silent | F | 0.00 | 4.00 | 0.00 | 0.00 |
| 9 | 0 | 0 | Silent | M | 0.00 | 4.00 | 0.00 | 0.00 |
| 10 | 0 | 0 | Silent | U | nan | nan | nan | nan |
| 11 | 0 | 0 | Silent | X | nan | nan | nan | nan |
| 12 | 0 | 0 | X | F | 0.00 | 6.00 | 0.00 | 0.00 |
| 13 | 0 | 0 | X | M | 0.00 | 14.00 | 0.00 | 0.00 |
| 14 | 0 | 0 | X | U | nan | nan | nan | nan |
| 15 | 0 | 0 | X | X | nan | nan | nan | nan |
| 16 | 0 | 0 | Z | F | 0.00 | 2.00 | 0.00 | 0.00 |
| 17 | 0 | 0 | Z | M | 0.00 | 3.00 | 0.00 | 0.00 |
| 18 | 0 | 0 | Z | U | nan | nan | nan | nan |
| 19 | 0 | 0 | Z | X | nan | nan | nan | nan |
| 20 | 0 | 1 | Boomers | F | nan | nan | nan | nan |
| 21 | 0 | 1 | Boomers | M | nan | nan | nan | nan |
| 22 | 0 | 1 | Boomers | U | nan | nan | nan | nan |
| 23 | 0 | 1 | Boomers | X | nan | nan | nan | nan |
| 24 | 0 | 1 | Millennial | F | nan | nan | nan | nan |
| 25 | 0 | 1 | Millennial | M | nan | nan | nan | nan |
| 26 | 0 | 1 | Millennial | U | nan | nan | nan | nan |
| 27 | 0 | 1 | Millennial | X | nan | nan | nan | nan |
| 28 | 0 | 1 | Silent | F | nan | nan | nan | nan |
| 29 | 0 | 1 | Silent | M | nan | nan | nan | nan |
| 30 | 0 | 1 | Silent | U | nan | nan | nan | nan |
| 31 | 0 | 1 | Silent | X | nan | nan | nan | nan |
| 32 | 0 | 1 | X | F | nan | nan | nan | nan |
| 33 | 0 | 1 | X | M | nan | nan | nan | nan |
| 34 | 0 | 1 | X | U | nan | nan | nan | nan |
| 35 | 0 | 1 | X | X | nan | nan | nan | nan |
| 36 | 0 | 1 | Z | F | nan | nan | nan | nan |
| 37 | 0 | 1 | Z | M | nan | nan | nan | nan |
| 38 | 0 | 1 | Z | U | nan | nan | nan | nan |
| 39 | 0 | 1 | Z | X | nan | nan | nan | nan |
| 40 | 1 | 0 | Boomers | F | 180.68 | 85.00 | 180.68 | 0.00 |
| 41 | 1 | 0 | Boomers | M | 208.77 | 163.00 | 208.77 | 0.00 |
| 42 | 1 | 0 | Boomers | U | nan | nan | nan | nan |
| 43 | 1 | 0 | Boomers | X | nan | nan | nan | nan |
| 44 | 1 | 0 | Millennial | F | 156.87 | 78.00 | 156.87 | 0.00 |
| 45 | 1 | 0 | Millennial | M | 201.15 | 105.00 | 201.15 | 0.00 |
| 46 | 1 | 0 | Millennial | U | 192.40 | 4.00 | 192.40 | 0.00 |
| 47 | 1 | 0 | Millennial | X | nan | nan | nan | nan |
| 48 | 1 | 0 | Silent | F | 184.91 | 14.00 | 184.91 | 0.00 |
| 49 | 1 | 0 | Silent | M | 311.93 | 10.00 | 311.93 | 0.00 |
| 50 | 1 | 0 | Silent | U | nan | nan | nan | nan |
| 51 | 1 | 0 | Silent | X | nan | nan | nan | nan |
| 52 | 1 | 0 | X | F | 185.25 | 79.00 | 185.25 | 0.00 |
| 53 | 1 | 0 | X | M | 255.64 | 171.00 | 255.64 | 0.00 |
| 54 | 1 | 0 | X | U | 245.40 | 3.00 | 245.40 | 0.00 |
| 55 | 1 | 0 | X | X | 108.78 | 1.00 | 108.78 | 0.00 |
| 56 | 1 | 0 | Z | F | 198.21 | 13.00 | 198.21 | 0.00 |
| 57 | 1 | 0 | Z | M | 152.77 | 11.00 | 152.77 | 0.00 |
| 58 | 1 | 0 | Z | U | nan | nan | nan | nan |
| 59 | 1 | 0 | Z | X | nan | nan | nan | nan |
| 60 | 1 | 1 | Boomers | F | 195.86 | 27.00 | 148.73 | 47.13 |
| 61 | 1 | 1 | Boomers | M | 228.19 | 28.00 | 182.05 | 46.14 |
| 62 | 1 | 1 | Boomers | U | 101.63 | 1.00 | 71.63 | 30.00 |
| 63 | 1 | 1 | Boomers | X | nan | nan | nan | nan |
| 64 | 1 | 1 | Millennial | F | 190.48 | 21.00 | 151.97 | 38.51 |
| 65 | 1 | 1 | Millennial | M | 177.68 | 24.00 | 145.35 | 32.34 |
| 66 | 1 | 1 | Millennial | U | nan | nan | nan | nan |
| 67 | 1 | 1 | Millennial | X | nan | nan | nan | nan |
| 68 | 1 | 1 | Silent | F | 216.41 | 6.00 | 159.82 | 56.59 |
| 69 | 1 | 1 | Silent | M | 179.62 | 4.00 | 119.78 | 59.84 |
| 70 | 1 | 1 | Silent | U | nan | nan | nan | nan |
| 71 | 1 | 1 | Silent | X | nan | nan | nan | nan |
| 72 | 1 | 1 | X | F | 189.30 | 17.00 | 156.13 | 33.16 |
| 73 | 1 | 1 | X | M | 239.35 | 40.00 | 194.60 | 44.74 |
| 74 | 1 | 1 | X | U | nan | nan | nan | nan |
| 75 | 1 | 1 | X | X | nan | nan | nan | nan |
| 76 | 1 | 1 | Z | F | 109.30 | 5.00 | 79.31 | 29.99 |
| 77 | 1 | 1 | Z | M | 168.06 | 2.00 | 126.18 | 41.88 |
| 78 | 1 | 1 | Z | U | nan | nan | nan | nan |
| 79 | 1 | 1 | Z | X | nan | nan | nan | nan |
#Total revenue, flight Revenue and ancillary revenue by Barcley Card
sns.catplot(y='Total_Revenue',x='BARCLAYS_CARD_HOLDER_IND',hue='ancillary_spend_ind',kind='bar', data=df_subset_spend,order=df_subset_spend.BARCLAYS_CARD_HOLDER_IND.value_counts().iloc[:7].index).set(title='Total Revenue by Card Holder Status',
xlabel='Barcley Card Indicator', ylabel='Total Revenue')
plt.ylim(0, 250)
sns.catplot(y='FLIGHT_REV',x='BARCLAYS_CARD_HOLDER_IND',hue='ancillary_spend_ind',kind='bar', data=df_subset_spend, order=df_subset_spend.BARCLAYS_CARD_HOLDER_IND.value_counts().iloc[:7].index).set(title='Flight Revenue by Card Holder Status',
xlabel='Barcley Card Indicator', ylabel='Flight Revenue')
plt.ylim(0, 250)
sns.catplot(y='ancll_rev_ttl',x='BARCLAYS_CARD_HOLDER_IND',hue='ancillary_spend_ind',kind='bar', data=df_subset_spend, order=df_subset_spend.BARCLAYS_CARD_HOLDER_IND.value_counts().iloc[:7].index).set(title='Ancillary Revenue by Card Holder Status',
xlabel='Barcley Card Indicator', ylabel='Ancillary Revenue')
plt.ylim(0, 75)
(0.0, 75.0)
barcley_value_multiple = df_subset_spend.groupby(['Spend_Ind', 'ancillary_spend_ind','BARCLAYS_CARD_HOLDER_IND']).agg({'Total_Revenue': ['mean','count'],'FLIGHT_REV': ['mean'],'ancll_rev_ttl': ['mean']})
barcley_value_multiple.columns = ['ttl mean','ttl count','flight mean','ancllry mean']
barcley_value_multiple = barcley_value_multiple.reset_index()
barcley_value_multiple
| Spend_Ind | ancillary_spend_ind | BARCLAYS_CARD_HOLDER_IND | ttl mean | ttl count | flight mean | ancllry mean | |
|---|---|---|---|---|---|---|---|
| 0 | 1 | 0 | N | 207.84 | 647 | 207.84 | 0.00 |
| 1 | 1 | 0 | Y | 202.26 | 90 | 202.26 | 0.00 |
| 2 | 1 | 1 | N | 204.22 | 163 | 162.22 | 41.99 |
| 3 | 1 | 1 | Y | 203.98 | 12 | 162.28 | 41.70 |
#Total revenue, flight Revenue and ancillary revenue by Citi Card holder
sns.catplot(y='Total_Revenue',x='CITIBNK_PREMIM_CARD_HLDR_IND',hue='ancillary_spend_ind',kind='bar', data=df_subset_spend,order=df_subset_spend.CITIBNK_PREMIM_CARD_HLDR_IND.value_counts().iloc[:7].index).set(title='Total Revenue by Card Holder Status',
xlabel='Citi Card Indicator', ylabel='Total Revenue')
plt.ylim(0, 250)
sns.catplot(y='FLIGHT_REV',x='CITIBNK_PREMIM_CARD_HLDR_IND',hue='ancillary_spend_ind',kind='bar', data=df_subset_spend, order=df_subset_spend.CITIBNK_PREMIM_CARD_HLDR_IND.value_counts().iloc[:7].index).set(title='Flight Revenue by Card Holder Status',
xlabel='Citi Card Indicator', ylabel='Flight Revenue')
plt.ylim(0, 250)
sns.catplot(y='ancll_rev_ttl',x='CITIBNK_PREMIM_CARD_HLDR_IND',hue='ancillary_spend_ind',kind='bar', data=df_subset_spend, order=df_subset_spend.CITIBNK_PREMIM_CARD_HLDR_IND.value_counts().iloc[:7].index).set(title='Ancillary Revenue by Card Holder Status',
xlabel='Citi Card Indicator', ylabel='Ancillary Revenue')
plt.ylim(0, 75)
(0.0, 75.0)
citi_value_multiple = df_subset_spend.groupby(['Spend_Ind', 'ancillary_spend_ind','CITIBNK_PREMIM_CARD_HLDR_IND']).agg({'Total_Revenue': ['mean','count'],'FLIGHT_REV': ['mean'],'ancll_rev_ttl': ['mean']})
citi_value_multiple.columns = ['ttl mean','ttl count','flight mean','ancllry mean']
citi_value_multiple = citi_value_multiple.reset_index()
citi_value_multiple
| Spend_Ind | ancillary_spend_ind | CITIBNK_PREMIM_CARD_HLDR_IND | ttl mean | ttl count | flight mean | ancllry mean | |
|---|---|---|---|---|---|---|---|
| 0 | 1 | 0 | N | 200.24 | 568 | 200.24 | 0.00 |
| 1 | 1 | 0 | Y | 230.41 | 169 | 230.41 | 0.00 |
| 2 | 1 | 1 | N | 202.28 | 155 | 160.44 | 41.84 |
| 3 | 1 | 1 | Y | 219.10 | 20 | 176.05 | 43.05 |
#Total revenue, flight Revenue and ancillary revenue by trip type
# df_subset_spenda = df[df['ancillary_spend_ind']==1]
sns.set_theme(style="dark")
sns.set_palette('coolwarm')
plt.figure(figsize=(18,5))
plt.subplot(1,3,1)
ax = sns.barplot(x="Total_Revenue", y='BKG_BUSINES_LEISR_IND', hue="ancillary_spend_ind", data=df_subset_spend, ci=None, order=df_subset_spend.BKG_BUSINES_LEISR_IND.value_counts().iloc[:3].index).set(title='Total Revenue by Trip Type', xlabel='Total Revenue', ylabel='Trip Type')
plt.legend(bbox_to_anchor=(1.05, 1), loc=5, borderaxespad=0.)
# ##plt.xlim(0, 200)
plt.subplot(1,3,2)
ax = sns.barplot(x="FLIGHT_REV", y='BKG_BUSINES_LEISR_IND', hue="ancillary_spend_ind", data=df_subset_spend, ci=None, order=df_subset_spend.BKG_BUSINES_LEISR_IND.value_counts().iloc[:3].index).set(title='Flight Revenue by Trip Type', xlabel='Flight Revenue', ylabel='Trip Type')
plt.legend(bbox_to_anchor=(1.05, 1), loc=5, borderaxespad=0.)
plt.subplot(1,3,3)
ax = sns.barplot(x="ancll_rev_ttl", y='BKG_BUSINES_LEISR_IND', hue="ancillary_spend_ind", data=df_subset_spend, ci=None, order=df_subset_spend.BKG_BUSINES_LEISR_IND.value_counts().iloc[:3].index).set(title='Ancillary Revenue by Trip Type', xlabel='Ancillary Revenue', ylabel='Trip Type')
plt.legend(bbox_to_anchor=(1.05, 1), loc=5, borderaxespad=0.)
<matplotlib.legend.Legend at 0x7f17e17fabe0>
# x=df.groupby(['Spend_Ind'],['ancillary_spend_ind'])['Total_Revenue'].mean()
# print(x)
BL_multiple = df.groupby(['Spend_Ind', 'ancillary_spend_ind','BKG_BUSINES_LEISR_IND']).agg({'Total_Revenue': ['mean','count'],'FLIGHT_REV': ['mean'],'ancll_rev_ttl': ['mean']})
BL_multiple.columns = ['ttl mean','ttl count','flight mean','ancllry mean']
BL_multiple = BL_multiple.reset_index()
BL_multiple
| Spend_Ind | ancillary_spend_ind | BKG_BUSINES_LEISR_IND | ttl mean | ttl count | flight mean | ancllry mean | |
|---|---|---|---|---|---|---|---|
| 0 | 0 | 0 | B | 0.00 | 3 | 0.00 | 0.00 |
| 1 | 0 | 0 | L | 0.00 | 85 | 0.00 | 0.00 |
| 2 | 1 | 0 | B | 217.61 | 438 | 217.61 | 0.00 |
| 3 | 1 | 0 | L | 191.84 | 299 | 191.84 | 0.00 |
| 4 | 1 | 1 | B | 228.97 | 70 | 189.67 | 39.31 |
| 5 | 1 | 1 | L | 187.69 | 105 | 143.93 | 43.75 |
#Total revenue, flight Revenue and ancillary revenue by booking type
plt.figure(figsize=(16,6))
sns.set_theme(style="dark")
sns.set_palette('coolwarm')
plt.subplot(1,3,1)
ax = sns.boxplot(x="Total_Revenue", y="BX_IND",hue='ancillary_spend_ind', data=df_subset_spend, order=df_subset_spend.BX_IND.value_counts().iloc[:7].index, whis= 4).set(title='Total Revenue by Booking Type',
xlabel='Total Revenue', ylabel='Business Account Indicator')
plt.xlim(0, 300)
plt.legend(bbox_to_anchor=(1.05, 1.1), loc=3, borderaxespad=0.)
plt.subplot(1,3,2)
ax = sns.boxplot(x="FLIGHT_REV", y="BX_IND",hue='ancillary_spend_ind', data=df_subset_spend, order=df_subset_spend.BX_IND.value_counts().iloc[:7].index, whis= 4).set(title='Flight Revenue by Booking Type',
xlabel='Flight Revenue', ylabel='Business Account Indicator')
plt.xlim(0, 300)
plt.legend(bbox_to_anchor=(1.05, 1.1), loc=3, borderaxespad=0.)
plt.subplot(1,3,3)
ax = sns.boxplot(x="ancll_rev_ttl", y="BX_IND",hue='ancillary_spend_ind', data=df_subset_spend, order=df_subset_spend.BX_IND.value_counts().iloc[:7].index, whis= 4).set(title='Ancillary Revenue by Booking Type',
xlabel='Ancillary Revenue', ylabel='Business Account Indicator')
plt.xlim(0, 75)
plt.legend(bbox_to_anchor=(1.05, 1.1), loc=3, borderaxespad=0.)
<matplotlib.legend.Legend at 0x7f17ec946a20>
bx_value_multiple = df_subset_spend.groupby(['Spend_Ind', 'ancillary_spend_ind','BX_IND']).agg({'Total_Revenue': ['mean','count'],'FLIGHT_REV': ['mean'],'ancll_rev_ttl': ['mean']})
bx_value_multiple.columns = ['ttl mean','ttl count','flight mean','ancllry mean']
bx_value_multiple = bx_value_multiple.reset_index()
bx_value_multiple
| Spend_Ind | ancillary_spend_ind | BX_IND | ttl mean | ttl count | flight mean | ancllry mean | |
|---|---|---|---|---|---|---|---|
| 0 | 1 | 0 | N | 204.02 | 657 | 204.02 | 0.00 |
| 1 | 1 | 0 | Y | 232.95 | 80 | 232.95 | 0.00 |
| 2 | 1 | 1 | N | 203.54 | 165 | 161.50 | 42.04 |
| 3 | 1 | 1 | Y | 215.18 | 10 | 174.29 | 40.89 |
#Total revenue, flight Revenue and ancillary revenue by ticket type
sns.catplot(y='Total_Revenue',x='TKT_TYPE',hue='ancillary_spend_ind',kind='bar', data=df,order=df.TKT_TYPE.value_counts().iloc[:7].index).set(title='Total Revenue by Ticket Type',
xlabel='Ticket Type', ylabel='Total Revenue')
plt.ylim(0, 250)
sns.catplot(y='FLIGHT_REV',x='TKT_TYPE',hue='ancillary_spend_ind',kind='bar', data=df, order=df.TKT_TYPE.value_counts().iloc[:7].index).set(title='Flight Revenue by Ticket Type',
xlabel='Ticket Type', ylabel='Flight Revenue')
plt.ylim(0, 250)
sns.catplot(y='ancll_rev_ttl',x='TKT_TYPE',hue='ancillary_spend_ind',kind='bar', data=df, order=df.TKT_TYPE.value_counts().iloc[:7].index).set(title='Ancillary Revenue by Ticket Type',
xlabel='Ticket Type', ylabel='Ancillary Revenue')
plt.ylim(0, 75)
(0.0, 75.0)
ticket_value_multiple = df.groupby(['Spend_Ind', 'ancillary_spend_ind','TKT_TYPE']).agg({'Total_Revenue': ['mean','count'],'FLIGHT_REV': ['mean'],'ancll_rev_ttl': ['mean']})
ticket_value_multiple.columns = ['ttl mean','ttl count','flight mean','ancllry mean']
ticket_value_multiple = ticket_value_multiple.reset_index()
ticket_value_multiple
| Spend_Ind | ancillary_spend_ind | TKT_TYPE | ttl mean | ttl count | flight mean | ancllry mean | |
|---|---|---|---|---|---|---|---|
| 0 | 0 | 0 | A | 0.00 | 88 | 0.00 | 0.00 |
| 1 | 1 | 0 | R | 207.16 | 737 | 207.16 | 0.00 |
| 2 | 1 | 1 | A | 49.81 | 9 | 0.00 | 49.81 |
| 3 | 1 | 1 | R | 212.57 | 166 | 171.02 | 41.55 |
#Total revenue, flight Revenue and ancillary revenue by cabin flown
sns.catplot(y='Total_Revenue',x='CABIN_FLOWN',hue='ancillary_spend_ind',kind='bar', data=df_subset_spend,order=df_subset_spend.CABIN_FLOWN.value_counts().iloc[:4].index, ci=None).set(title='Total Revenue by Cabin Flown',
xlabel='Cabin Flown', ylabel='Total Revenue')
plt.ylim(0, 3000)
sns.catplot(y='FLIGHT_REV',x='CABIN_FLOWN',hue='ancillary_spend_ind',kind='bar', data=df_subset_spend, order=df_subset_spend.CABIN_FLOWN.value_counts().iloc[:4].index, ci=None).set(title='Flight Revenue by Cabin Flown',
xlabel='Cabin Flow', ylabel='Flight Revenue')
plt.ylim(0, 3000)
sns.catplot(y='ancll_rev_ttl',x='CABIN_FLOWN',hue='ancillary_spend_ind',kind='bar', data=df_subset_spend, order=df_subset_spend.CABIN_FLOWN.value_counts().iloc[:4].index, ci=None).set(title='Ancillary Revenue by Cabin Flown',
xlabel='Cabin Flown', ylabel='Ancillary Revenue')
plt.ylim(0, 500)
# t1.CABIN_BOOKED, /*Y - main cabin, W - premium economy, C = business, F = first*/
# t1.CABIN_FLOWN, /*Y - main cabin, W - premium economy, C = business, F = first*/
(0.0, 500.0)
cabin_value_multiple = df_subset_spend.groupby(['Spend_Ind', 'ancillary_spend_ind','CABIN_FLOWN']).agg({'Total_Revenue': ['mean','count'],'FLIGHT_REV': ['mean'],'ancll_rev_ttl': ['mean']})
cabin_value_multiple.columns = ['ttl mean','ttl count','flight mean','ancllry mean']
cabin_value_multiple = cabin_value_multiple.reset_index()
cabin_value_multiple
| Spend_Ind | ancillary_spend_ind | CABIN_FLOWN | ttl mean | ttl count | flight mean | ancllry mean | |
|---|---|---|---|---|---|---|---|
| 0 | 1 | 0 | C | 367.18 | 146 | 367.18 | 0.00 |
| 1 | 1 | 0 | W | 701.57 | 6 | 701.57 | 0.00 |
| 2 | 1 | 0 | Y | 162.15 | 585 | 162.15 | 0.00 |
| 3 | 1 | 1 | C | 413.21 | 5 | 353.66 | 59.55 |
| 4 | 1 | 1 | Y | 198.05 | 170 | 156.60 | 41.46 |
#Total revenue, flight Revenue and ancillary revenue by upgrage indicator
sns.catplot(y='Total_Revenue',x='upgrade_ind',hue='ancillary_spend_ind',kind='bar', data=df_subset_spend,order=df_subset_spend.upgrade_ind.value_counts().iloc[:4].index, ci=None).set(title='Total Revenue by Upgrade ',
xlabel='Upgrade Indicator', ylabel='Total Revenue')
plt.ylim(0, 700)
sns.catplot(y='FLIGHT_REV',x='upgrade_ind',hue='ancillary_spend_ind',kind='bar', data=df_subset_spend, order=df_subset_spend.upgrade_ind.value_counts().iloc[:4].index, ci=None).set(title='Flight Revenue by Upgrade',
xlabel='Upgrade Indicator', ylabel='Flight Revenue')
plt.ylim(0, 700)
sns.catplot(y='ancll_rev_ttl',x='upgrade_ind',hue='ancillary_spend_ind',kind='bar', data=df_subset_spend, order=df_subset_spend.upgrade_ind.value_counts().iloc[:4].index, ci=None).set(title='Ancillary Revenue by Upgrade',
xlabel='Upgrade Indicator', ylabel='Ancillary Revenue')
plt.ylim(0, 200)
(0.0, 200.0)
upgrade_value_multiple = df_subset_spend.groupby(['Spend_Ind', 'ancillary_spend_ind','upgrade_ind']).agg({'Total_Revenue': ['mean','count'],'FLIGHT_REV': ['mean'],'ancll_rev_ttl': ['mean']})
upgrade_value_multiple.columns = ['ttl mean','ttl count','flight mean','ancllry mean']
upgrade_value_multiple = upgrade_value_multiple.reset_index()
upgrade_value_multiple
| Spend_Ind | ancillary_spend_ind | upgrade_ind | ttl mean | ttl count | flight mean | ancllry mean | |
|---|---|---|---|---|---|---|---|
| 0 | 1 | 0 | 0 | 194.41 | 609 | 194.41 | 0.00 |
| 1 | 1 | 0 | 1 | 267.81 | 128 | 267.81 | 0.00 |
| 2 | 1 | 1 | 0 | 198.79 | 171 | 157.35 | 41.45 |
| 3 | 1 | 1 | 1 | 435.36 | 4 | 370.91 | 64.44 |
# Total Revenue on CITIBNK PREMIM CARD HLDR by BX_IND
sns.set_theme(style="dark")
sns.set_palette('coolwarm')
sns.violinplot(data=df_subset_spend, x="BX_IND", y="Total_Revenue", hue="CITIBNK_PREMIM_CARD_HLDR_IND",
split=True, inner="quart", linewidth=1,
palette="coolwarm")
sns.despine(left=True)
plt.ylim(0, 1000)
(0.0, 1000.0)
# Total Revenue on CITIBNK PREMIM CARD HLDR by bussiness leisure indicator
sns.set_theme(style="dark")
sns.set_palette('coolwarm')
g = sns.catplot(
data=df_subset_spend, kind="bar",
x="BKG_BUSINES_LEISR_IND", y="Total_Revenue", hue="CITIBNK_PREMIM_CARD_HLDR_IND",
ci="sd", palette="dark", alpha=.6, height=6
)
g.despine(left=True)
g.set_axis_labels("Business Leisure Indicator", "Total Revenue")
g.legend.set_title("Citibank Card Holder")
# Flight Revenue on CITIBNK PREMIM CARD HLDR by bussiness leisure indicator
sns.set_theme(style="dark")
sns.set_palette('coolwarm')
g = sns.catplot(
data=df_subset_spend, kind="bar",
x="BKG_BUSINES_LEISR_IND", y="FLIGHT_REV", hue="CITIBNK_PREMIM_CARD_HLDR_IND",
ci="sd", palette="dark", alpha=.6, height=6
)
g.despine(left=True)
g.set_axis_labels("Business Leisure Indicator", "Flight Revenue")
g.legend.set_title("Citibank Card Holder")
# Total Ancillary on CITIBNK PREMIM CARD HLDR by bussiness leisure indicator
sns.set_theme(style="dark")
sns.set_palette('coolwarm')
g = sns.catplot(
data=df_subset_spend, kind="bar",
x="BKG_BUSINES_LEISR_IND", y="ancll_rev_ttl", hue="CITIBNK_PREMIM_CARD_HLDR_IND",
ci="sd", palette="dark", alpha=.6, height=6
)
g.despine(left=True)
g.set_axis_labels("Business Leisure Indicator", "Ancillary Revenue")
g.legend.set_title("Citibank Card Holder")
# Total revenue on BARCLAYS CARD HOLDER
sns.set_theme(style="dark")
sns.set_palette('coolwarm')
sns.boxplot(x="LYLTY_LEVEL_CD", y="Total_Revenue",
hue="BARCLAYS_CARD_HOLDER_IND",
data=df_subset_spend)
#sns.despine(offset=10, trim=True)
plt.ylim(0, 1000)
(0.0, 1000.0)
# Ancillary revenue on BARCLAYS CARD HOLDER
sns.set_theme(style="dark")
sns.set_palette('coolwarm')
sns.boxplot(x="LYLTY_LEVEL_CD", y="ancll_rev_ttl",
hue="BARCLAYS_CARD_HOLDER_IND",
data=df_subset_ancll_spend)
#sns.despine(offset=10, trim=True)
plt.ylim(0, 800)
(0.0, 800.0)
# Total Revenue by Loyalty Level, age and gender
sns.set_theme(style="dark")
sns.set_palette('coolwarm')
plt.figure(figsize=(16,6))
ax = sns.boxplot(x="Total_Revenue", y="age_cat",hue='GENDER_CD', data=df_subset_spend, order=df_subset_spend.age_cat.value_counts().iloc[:7].index).set(title='Total Revenue by Loyalty Level',
xlabel='Total Revenue', ylabel='Age Category')
# # plot distplot
# fig, ax = plt.subplots()
# sns.distplot(df_subset_spend, ax = ax)
plt.xlim(0, 500)
# # change the limits of X-axis
# plt.show()
# loc, labels = plt.xticks()
# a = graph.set_xticklabels(labels, rotation=180)
(0.0, 500.0)
#Flight Revenue by Loyalty Level, age and gender
sns.set_theme(style="dark")
sns.set_palette('coolwarm')
plt.figure(figsize=(16,6))
ax = sns.boxplot(x="FLIGHT_REV", y="age_cat",hue='GENDER_CD', data=df_subset_spend, order=df_subset_spend.age_cat.value_counts().iloc[:7].index).set(title='Flight Revenue by Loyalty Level',
xlabel='Flight Revenue', ylabel='Age Category')
# # plot distplot
# fig, ax = plt.subplots()
# sns.distplot(df_subset_spend, ax = ax)
plt.xlim(0, 500)
# # change the limits of X-axis
# plt.show()
# loc, labels = plt.xticks()
# a = graph.set_xticklabels(labels, rotation=180)
(0.0, 500.0)
# Total revenue by loyalty level
sns.set_theme(style="dark")
sns.set_palette('coolwarm')
plt.figure(figsize=(16,6))
graph = sns.boxplot(x="Total_Revenue", y="LYLTY_LEVEL_CD", data=df_subset_spend, order=df_subset_spend.LYLTY_LEVEL_CD.value_counts().iloc[:7].index).set(title='Total Revenue by Loyalty Level',
xlabel='Total Revenue', ylabel='Loyalty Level')
# loc, labels = plt.xticks()
plt.xlim(0, 2000)
# a = graph.set_xticklabels(labels, rotation=180)
(0.0, 2000.0)
# Flight revenue by loyalty level
sns.set_theme(style="dark")
sns.set_palette('coolwarm')
plt.figure(figsize=(16,6))
graph = sns.boxplot(x="FLIGHT_REV", y="LYLTY_LEVEL_CD", data=df_subset_spend)
loc, labels = plt.xticks()
a = graph.set_xticklabels(labels, rotation=180)
plt.xlim(0, 2000)
(0.0, 2000.0)
#Total revenue, flight Revenue and ancillary revenue by age and gender
sns.set_theme(style="dark")
sns.set_palette('coolwarm')
plt.figure(figsize=(18,5))
plt.subplot(1,3,1)
sns.barplot(df_subset_spend["Total_Revenue"], df_subset_spend["age_cat"])
plt.title("Total Revenue by Gender")
plt.xlim(0, 500)
plt.subplot(1,3,2)
sns.barplot(df_subset_spend["FLIGHT_REV"], df_subset_spend["age_cat"])
plt.title("Flight Revenue by Gender")
plt.xlim(0, 500)
plt.subplot(1,3,3)
sns.barplot(df_subset_spend["ancll_rev_ttl"], df_subset_spend["age_cat"])
plt.title("Ancillary Revenue by Gender")
plt.xlim(0, 100)
plt.show()
# Flight Revenue by Age and Gender
sns.set_theme(style="dark")
sns.set_palette('coolwarm')
ax = sns.barplot(x="FLIGHT_REV", y="age_cat", hue="GENDER_CD", data=df_subset_spend, ci=None, order=df_subset_spend.age_cat.value_counts().iloc[:7].index).set(title='Flight Revenue by Age and Gender', xlabel='', ylabel='')
plt.xlim(0, 500)
plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
<matplotlib.legend.Legend at 0x7f17e17039e8>
#Ancillary Revenue by Age and Gender
sns.set_theme(style="dark")
sns.set_palette('coolwarm')
ax = sns.barplot(x="ancll_rev_ttl", y="age_cat", hue="GENDER_CD", data=df_subset_ancll_spend, ci=None, order=df_subset_ancll_spend.age_cat.value_counts().iloc[:7].index).set(title='Ancillary Revenue by Age and Gender', xlabel='', ylabel='')
plt.xlim(0, 200)
plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
<matplotlib.legend.Legend at 0x7f17e1774710>
# Ancillary Revenue by Age and Gender
plt.figure(figsize=(18,5))
sns.set_theme(style="dark")
sns.set_palette('coolwarm')
ax = sns.barplot(x='ancll_rev_ttl', y="age_cat", hue="GENDER_CD", data=df_subset_ancll_spend, ci=None, order=df_subset_ancll_spend.age_cat.value_counts().iloc[:7].index).set(title='Ancillary Revenue by Age and Gender', xlabel='', ylabel='')
plt.xlim(0, 200)
plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
<matplotlib.legend.Legend at 0x7f17e1480dd8>
#Total Revenue by Age and Gender
plt.figure(figsize=(18,5))
sns.set_theme(style="dark")
sns.set_palette('coolwarm')
ax = sns.barplot(x='Total_Revenue', y="age_cat", hue="GENDER_CD", data=df_subset_ancll_spend, ci=None, order=df_subset_ancll_spend.age_cat.value_counts().iloc[:7].index).set(title='Total Revenue by Age and Gender', xlabel='', ylabel='')
plt.xlim(0, 1000)
plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
<matplotlib.legend.Legend at 0x7f17e0105198>
# Fare class analysis on total revenue, flight revenue and total ancillary
sns.catplot(y='Total_Revenue',x='ACCT_FARE_CLASS_TYPE',hue='ancillary_spend_ind',kind='bar', data=df_subset_spend,order=df_subset_spend.ACCT_FARE_CLASS_TYPE.value_counts().iloc[:7].index, height=4, aspect=2.5, ci=None).set(title='Total Revenue by Fare Class',
xlabel='Fare Class', ylabel='Total Revenue')
plt.ylim(0, 3000)
sns.catplot(y='FLIGHT_REV',x='ACCT_FARE_CLASS_TYPE',hue='ancillary_spend_ind',kind='bar', data=df_subset_spend, order=df_subset_spend.ACCT_FARE_CLASS_TYPE.value_counts().iloc[:7].index, height=4, aspect=2.5, ci=None).set(title='Flight Revenue by Fare Class',
xlabel='Fare Class', ylabel='Flight Revenue')
plt.ylim(0, 3000)
sns.catplot(y='ancll_rev_ttl',x='ACCT_FARE_CLASS_TYPE',hue='ancillary_spend_ind',kind='bar', data=df_subset_spend, order=df_subset_spend.ACCT_FARE_CLASS_TYPE.value_counts().iloc[:7].index, height=4, aspect=2.5, ci=None).set(title='Ancillary Revenue by Fare Class',
xlabel='Fare Class', ylabel='Ancillary Revenue')
plt.ylim(0, 300)
(0.0, 300.0)
# Travel type analysis on total revenue, flight revenue and total ancillary
sns.set_theme(style="dark")
sns.set_palette('coolwarm')
sns.catplot(y='Total_Revenue',x='SEG_INTRVL_TYPE',hue='ancillary_spend_ind',kind='bar', data=df_subset_spend,order=df_subset_spend.SEG_INTRVL_TYPE.value_counts().iloc[:4].index, ci=None).set(title='Total Revenue by Segment ',
xlabel='Travel Type', ylabel='Total Revenue')
plt.ylim(0, 700)
sns.catplot(y='FLIGHT_REV',x='SEG_INTRVL_TYPE',hue='ancillary_spend_ind',kind='bar', data=df_subset_spend, order=df_subset_spend.SEG_INTRVL_TYPE.value_counts().iloc[:4].index, ci=None).set(title='Flight Revenue by Segment',
xlabel='Travel Type', ylabel='Flight Revenue')
plt.ylim(0, 700)
sns.catplot(y='ancll_rev_ttl',x='SEG_INTRVL_TYPE',hue='ancillary_spend_ind',kind='bar', data=df_subset_spend, order=df_subset_spend.SEG_INTRVL_TYPE.value_counts().iloc[:4].index, ci=None).set(title='Ancillary Revenue by Segment',
xlabel='Travel Type', ylabel='Ancillary Revenue')
plt.ylim(0, 200)
(0.0, 200.0)
# Upgrade indicator analysis on total revenue, flight revenue and total ancillary by lounge member
sns.set_theme(style="dark")
sns.set_palette('coolwarm')
sns.catplot(y='Total_Revenue',x='LOUNGE_MBR_PRTCPNT_IND',hue='ancillary_spend_ind',kind='bar', data=df_subset_spend,order=df_subset_spend.LOUNGE_MBR_PRTCPNT_IND.value_counts().iloc[:4].index, ci=None).set(title='Total Revenue by Lounge Membership',
xlabel='Upgrade Indicator', ylabel='Total Revenue')
plt.ylim(0, 700)
sns.catplot(y='FLIGHT_REV',x='LOUNGE_MBR_PRTCPNT_IND',hue='ancillary_spend_ind',kind='bar', data=df_subset_spend, order=df_subset_spend.LOUNGE_MBR_PRTCPNT_IND.value_counts().iloc[:4].index, ci=None).set(title='Flight Revenue by Lounge Membership',
xlabel='Upgrade Indicator', ylabel='Flight Revenue')
plt.ylim(0, 700)
sns.catplot(y='ancll_rev_ttl',x='LOUNGE_MBR_PRTCPNT_IND',hue='ancillary_spend_ind',kind='bar', data=df_subset_spend, order=df_subset_spend.LOUNGE_MBR_PRTCPNT_IND.value_counts().iloc[:4].index, ci=None).set(title='Ancillary Revenue by Lounge Membership',
xlabel='Upgrade Indicator', ylabel='Ancillary Revenue')
plt.ylim(0, 200)
(0.0, 200.0)
#Day of the week analysis on total revenue, flight revenue and total ancillary
plt.figure(figsize=(18,5))
sns.set_theme(style="dark")
sns.set_palette('coolwarm')
plt.subplot(1,3,1)
sns.barplot(df_subset_spend["Total_Revenue"], df_subset_spend["day_of_week_dep"])
plt.title("Total Revenue by Day of Week")
plt.subplot(1,3,2)
sns.barplot(df_subset_spend["FLIGHT_REV"], df_subset_spend["day_of_week_dep"])
plt.title("Flight Revenue by Day of Week")
plt.subplot(1,3,3)
sns.barplot(df_subset_spend["ancll_rev_ttl"], df_subset_spend["day_of_week_dep"])
plt.title("Ancillary Revenue by Day of Week")
plt.show()
#Monthly analysis on total revenue, flight revenue and total ancillary
plt.figure(figsize=(18,5))
sns.set_theme(style="dark")
sns.set_palette('coolwarm')
plt.subplot(1,3,1)
sns.barplot(df_subset_spend["Total_Revenue"], df_subset_spend["dep_month"])
plt.title("Total Revenue by Month")
plt.subplot(1,3,2)
sns.barplot(df_subset_spend["FLIGHT_REV"], df_subset_spend["dep_month"])
plt.title("Flight Revenue by Month")
plt.subplot(1,3,3)
sns.barplot(df_subset_spend["ancll_rev_ttl"], df_subset_spend["dep_month"])
plt.title("Ancillary Revenue by Month")
plt.show()
# Plot production Revenue by month
df_monthly_mean = df_subset_spend.groupby(by="dep_month").mean()
plt.figure(figsize=(18,5))
sns.set_theme(style="dark")
sns.set_palette('coolwarm')
plt.bar(df_monthly_mean.index, height=df_monthly_mean['Total_Revenue'].values)
plt.ylabel('mean(Total_Revenue)')
plt.xlabel('Month')
Text(0.5, 0, 'Month')
# Plot production Revenue by day of week
df_weekly_mean = df_subset_spend.groupby(by="day_of_week_dep").mean()
plt.figure(figsize=(15,6))
sns.set_theme(style="dark")
sns.set_palette('coolwarm')
plt.bar(df_weekly_mean.index, height=df_weekly_mean['Total_Revenue'].values)
plt.ylabel('mean(Total_Revenue)')
plt.xlabel('Day of Week Departure')
Text(0.5, 0, 'Day of Week Departure')
#New dataframe for scatter plot
df_scatter = df_subset_spend[['Total_Revenue',
'FLIGHT_REV',
'ancll_rev_ttl','rev_pref_seats','rev_bag_non_cat','rev_upgrade','days_before_dept',
'MILE_GREAT_CIRCLE_DISTANC_QTY']].copy()
# Scatter plot
sns.set_palette(sns.color_palette("coolwarm", 16))
sns.pairplot(df_scatter,diag_kind = 'kde',
size = 4, corner=False)
plt.show()
#limiting dataframe for numerical features
df_c=df[[
'AA_OPER_REVNUE_SEG_AMT',
'CHECK_BAGS_PER_TRIP_AVG',
'FLOWN_SEG_QTY',
'days_before_dept',
'MILE_GREAT_CIRCLE_DISTANC_QTY',
'FLIGHT_REV',
'ancll_rev_ttl',
'rev_pref_seats',
'rev_bag_non_cat',
'rev_other_non_cat',
'rev_mileage_multiplier',
'rev_tsa',
'rev_upgrade',
'Total_Revenue',
"Spend_Ind"
]]
df_sc=df_c[(df_c["Spend_Ind"] == 1)]
df_n=df_sc[[
'AA_OPER_REVNUE_SEG_AMT',
'CHECK_BAGS_PER_TRIP_AVG',
'FLOWN_SEG_QTY',
'days_before_dept',
'MILE_GREAT_CIRCLE_DISTANC_QTY',
'FLIGHT_REV',
'ancll_rev_ttl',
'rev_pref_seats',
'rev_bag_non_cat',
'rev_other_non_cat',
'rev_mileage_multiplier',
'rev_tsa',
'rev_upgrade',
'Total_Revenue'
]]
# Plot corrolation heatmap
print("matplotlib version " + matplotlib.__version__)
print("seaborn version " + sns.__version__)
sns.set(style='white')
corr = df_n.corr()
plt.figure(figsize=(12,12))
sns.set(font_scale=1)
ax=sns.heatmap(data=corr,
center=0,
cmap=sns.diverging_palette(230, 10, as_cmap=True),
square=True, linewidth=0.5)
ax.set_title('Feature Corrolation Heatmap')
matplotlib version 3.3.4 seaborn version 0.11.2
Text(0.5, 1.0, 'Feature Corrolation Heatmap')
plt.figure(figsize=(10,10))
sns.heatmap(df_scatter.corr(),annot=True)
plt.show()
#Revenue breakdown annually
dep_multiple = df_subset_spend.groupby(['Spend_Ind','dep_month','ancillary_spend_ind']).agg({'Total_Revenue': ['mean','count'],'FLIGHT_REV': ['mean'],'ancll_rev_ttl': ['mean'],
'rev_bag_non_cat': ['mean'],
'rev_other_non_cat':['mean'],
'rev_upgrade': ['mean'],
'rev_tsa': ['mean'],
'rev_mileage_multiplier': ['mean'],
'rev_bag': ['mean'],
'rev_pref_seats': ['mean'],})
dep_multiple.columns = ['ttl mean','ttl count','flight mean','ancllry mean','rev_bag_non_cat', 'rev_other_non_cat','rev_upgrade','rev_tsa','rev_mileage_multiplier','rev_bag','rev_pref_seats']
dep_multiple = dep_multiple.reset_index()
dep_multiple
| Spend_Ind | dep_month | ancillary_spend_ind | ttl mean | ttl count | flight mean | ancllry mean | rev_bag_non_cat | rev_other_non_cat | rev_upgrade | rev_tsa | rev_mileage_multiplier | rev_bag | rev_pref_seats | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 1 | 0 | 234.63 | 59 | 234.63 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 |
| 1 | 1 | 1 | 1 | 177.35 | 19 | 135.86 | 41.49 | 19.74 | 3.95 | 0.00 | 0.00 | 0.00 | 0.00 | 17.81 |
| 2 | 1 | 10 | 0 | 238.73 | 60 | 238.73 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 |
| 3 | 1 | 10 | 1 | 224.13 | 9 | 188.49 | 35.64 | 17.78 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 17.86 |
| 4 | 1 | 11 | 0 | 180.13 | 61 | 180.13 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 |
| 5 | 1 | 11 | 1 | 225.12 | 18 | 178.82 | 46.30 | 18.53 | 0.00 | 2.38 | 0.00 | 0.00 | 0.00 | 25.39 |
| 6 | 1 | 12 | 0 | 252.99 | 54 | 252.99 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 |
| 7 | 1 | 12 | 1 | 193.51 | 13 | 155.73 | 37.77 | 16.92 | 0.00 | 3.29 | 0.00 | 0.00 | 0.00 | 17.56 |
| 8 | 1 | 2 | 0 | 186.85 | 54 | 186.85 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 |
| 9 | 1 | 2 | 1 | 166.50 | 14 | 132.24 | 34.27 | 28.57 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 5.70 |
| 10 | 1 | 3 | 0 | 229.66 | 55 | 229.66 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 |
| 11 | 1 | 3 | 1 | 272.24 | 12 | 223.85 | 48.38 | 17.92 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 30.47 |
| 12 | 1 | 4 | 0 | 192.99 | 70 | 192.99 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 |
| 13 | 1 | 4 | 1 | 255.67 | 10 | 200.82 | 54.85 | 10.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 44.85 |
| 14 | 1 | 5 | 0 | 194.74 | 65 | 194.74 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 |
| 15 | 1 | 5 | 1 | 167.17 | 9 | 131.57 | 35.60 | 13.33 | 8.33 | 0.00 | 0.00 | 0.00 | 0.00 | 13.93 |
| 16 | 1 | 6 | 0 | 225.52 | 62 | 225.52 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 |
| 17 | 1 | 6 | 1 | 189.20 | 24 | 144.79 | 44.41 | 21.88 | 3.12 | 0.00 | 1.46 | 0.00 | 0.00 | 17.95 |
| 18 | 1 | 7 | 0 | 214.20 | 79 | 214.20 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 |
| 19 | 1 | 7 | 1 | 201.35 | 17 | 167.69 | 33.65 | 19.41 | 4.41 | 0.00 | 0.00 | 3.03 | 0.00 | 6.80 |
| 20 | 1 | 8 | 0 | 188.24 | 51 | 188.24 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 |
| 21 | 1 | 8 | 1 | 227.10 | 14 | 186.19 | 40.92 | 17.20 | 0.00 | 6.12 | 0.00 | 0.00 | 0.00 | 17.59 |
| 22 | 1 | 9 | 0 | 156.21 | 67 | 156.21 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 |
| 23 | 1 | 9 | 1 | 186.16 | 16 | 137.91 | 48.25 | 23.74 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 24.50 |
#Revenu breakdown weekly
dow_multiple = df_subset_spend.groupby(['Spend_Ind','day_of_week_dep','ancillary_spend_ind']).agg({'Total_Revenue': ['mean','count'],'FLIGHT_REV': ['mean'],'ancll_rev_ttl': ['mean'],
'rev_bag_non_cat': ['mean'],
'rev_other_non_cat':['mean'],
'rev_upgrade': ['mean'],
'rev_tsa': ['mean'],
'rev_mileage_multiplier': ['mean'],
'rev_bag': ['mean'],
'rev_pref_seats': ['mean'],})
dow_multiple.columns = ['ttl mean','ttl count','flight mean','ancllry mean','rev_bag_non_cat', 'rev_other_non_cat','rev_upgrade','rev_tsa','rev_mileage_multiplier','rev_bag','rev_pref_seats']
dow_multiple = dow_multiple.reset_index()
dow_multiple
| Spend_Ind | day_of_week_dep | ancillary_spend_ind | ttl mean | ttl count | flight mean | ancllry mean | rev_bag_non_cat | rev_other_non_cat | rev_upgrade | rev_tsa | rev_mileage_multiplier | rev_bag | rev_pref_seats | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 1 | 0 | 247.37 | 93 | 247.37 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 |
| 1 | 1 | 1 | 1 | 232.09 | 29 | 187.63 | 44.45 | 15.86 | 0.00 | 4.43 | 0.00 | 0.00 | 0.00 | 24.16 |
| 2 | 1 | 2 | 0 | 181.06 | 132 | 181.06 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 |
| 3 | 1 | 2 | 1 | 254.24 | 18 | 207.45 | 46.79 | 27.42 | 0.00 | 0.00 | 0.00 | 2.87 | 0.00 | 16.50 |
| 4 | 1 | 3 | 0 | 206.68 | 101 | 206.68 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 |
| 5 | 1 | 3 | 1 | 202.42 | 26 | 159.71 | 42.71 | 19.62 | 2.88 | 0.00 | 0.00 | 0.00 | 0.00 | 20.21 |
| 6 | 1 | 4 | 0 | 162.98 | 102 | 162.98 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 |
| 7 | 1 | 4 | 1 | 162.52 | 18 | 126.34 | 36.18 | 22.22 | 0.00 | 2.38 | 0.00 | 0.00 | 0.00 | 11.58 |
| 8 | 1 | 5 | 0 | 227.58 | 117 | 227.58 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 |
| 9 | 1 | 5 | 1 | 182.36 | 27 | 138.42 | 43.94 | 15.02 | 8.33 | 0.00 | 1.30 | 0.00 | 0.00 | 19.28 |
| 10 | 1 | 6 | 0 | 197.91 | 110 | 197.91 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 |
| 11 | 1 | 6 | 1 | 216.33 | 31 | 175.68 | 40.65 | 18.71 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 21.94 |
| 12 | 1 | 7 | 0 | 242.36 | 82 | 242.36 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 |
| 13 | 1 | 7 | 1 | 177.32 | 26 | 138.62 | 38.70 | 21.16 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 17.54 |
#Revenu breakdown on Domestic arrival
cd_arriv_multiple = US_df.groupby(['Spend_Ind','city_arr','ancillary_spend_ind']).agg({'Total_Revenue': ['mean','count'],'FLIGHT_REV': ['mean'],'ancll_rev_ttl': ['mean'],
'rev_bag_non_cat': ['mean'],
'rev_other_non_cat':['mean'],
'rev_upgrade': ['mean'],
'rev_tsa': ['mean'],
'rev_mileage_multiplier': ['mean'],
'rev_bag': ['mean'],
'rev_pref_seats': ['mean'],})
cd_arriv_multiple.columns = ['ttl mean','ttl count','flight mean','ancllry mean','rev_bag_non_cat', 'rev_other_non_cat','rev_upgrade','rev_tsa','rev_mileage_multiplier','rev_bag','rev_pref_seats']
cd_arriv_multiple = cd_arriv_multiple.reset_index()
cd_arriv_multiple
| Spend_Ind | city_arr | ancillary_spend_ind | ttl mean | ttl count | flight mean | ancllry mean | rev_bag_non_cat | rev_other_non_cat | rev_upgrade | rev_tsa | rev_mileage_multiplier | rev_bag | rev_pref_seats | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | Austin | 0 | 0.00 | 1 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 |
| 1 | 0 | Charlotte | 0 | 0.00 | 3 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 |
| 2 | 0 | Chicago | 0 | 0.00 | 5 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 |
| 3 | 0 | Cleveland | 0 | 0.00 | 1 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 |
| 4 | 0 | Dallas-Fort Worth | 0 | 0.00 | 16 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 132 | 1 | White Plains | 0 | 357.32 | 1 | 357.32 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 |
| 133 | 1 | Wichita | 0 | 140.52 | 2 | 140.52 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 |
| 134 | 1 | Wilmington | 0 | 67.00 | 3 | 67.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 |
| 135 | 1 | Windsor Locks | 0 | 353.37 | 3 | 353.37 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 |
| 136 | 1 | Yuma | 0 | 80.72 | 1 | 80.72 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 |
137 rows × 14 columns
#Revenu breakdown on Domestic departure
cd_dept_multiple = US_df.groupby(['Spend_Ind','city_dep','ancillary_spend_ind']).agg({'Total_Revenue': ['mean','count'],'FLIGHT_REV': ['mean'],'ancll_rev_ttl': ['mean'],
'rev_bag_non_cat': ['mean'],
'rev_other_non_cat':['mean'],
'rev_upgrade': ['mean'],
'rev_tsa': ['mean'],
'rev_mileage_multiplier': ['mean'],
'rev_bag': ['mean'],
'rev_pref_seats': ['mean'],})
cd_dept_multiple.columns = ['ttl mean','ttl count','flight mean','ancllry mean','rev_bag_non_cat', 'rev_other_non_cat','rev_upgrade','rev_tsa','rev_mileage_multiplier','rev_bag','rev_pref_seats']
cd_dept_multiple = cd_dept_multiple.reset_index()
cd_dept_multiple
| Spend_Ind | city_dep | ancillary_spend_ind | ttl mean | ttl count | flight mean | ancllry mean | rev_bag_non_cat | rev_other_non_cat | rev_upgrade | rev_tsa | rev_mileage_multiplier | rev_bag | rev_pref_seats | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | Aspen | 0 | 0.00 | 1 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 |
| 1 | 0 | Atlanta | 0 | 0.00 | 2 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 |
| 2 | 0 | Baltimore | 0 | 0.00 | 1 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 |
| 3 | 0 | Boston | 0 | 0.00 | 2 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 |
| 4 | 0 | Charlotte | 0 | 0.00 | 10 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 206 | 1 | White Plains | 0 | 226.98 | 1 | 226.98 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 |
| 207 | 1 | Wilmington | 0 | 74.89 | 1 | 74.89 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 |
| 208 | 1 | Windsor Locks | 0 | 131.11 | 2 | 131.11 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 |
| 209 | 1 | Windsor Locks | 1 | 111.86 | 1 | 81.86 | 30.00 | 30.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 |
| 210 | 1 | Yuma | 0 | 91.06 | 2 | 91.06 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 |
211 rows × 14 columns
#Revenue breakdown on international arrival
ci_arrv_multiple = overseas_df.groupby(['Spend_Ind','city_arr','ancillary_spend_ind']).agg({'Total_Revenue': ['mean','count'],'FLIGHT_REV': ['mean'],'ancll_rev_ttl': ['mean'],
'rev_bag_non_cat': ['mean'],
'rev_other_non_cat':['mean'],
'rev_upgrade': ['mean'],
'rev_tsa': ['mean'],
'rev_mileage_multiplier': ['mean'],
'rev_bag': ['mean'],
'rev_pref_seats': ['mean'],})
ci_arrv_multiple.columns = ['ttl mean','ttl count','flight mean','ancllry mean','rev_bag_non_cat', 'rev_other_non_cat','rev_upgrade','rev_tsa','rev_mileage_multiplier','rev_bag','rev_pref_seats']
ci_arrv_multiple = ci_arrv_multiple.reset_index()
ci_arrv_multiple
| Spend_Ind | city_arr | ancillary_spend_ind | ttl mean | ttl count | flight mean | ancllry mean | rev_bag_non_cat | rev_other_non_cat | rev_upgrade | rev_tsa | rev_mileage_multiplier | rev_bag | rev_pref_seats | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | Cancun | 0 | 0.00 | 1 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 |
| 1 | 0 | Charlotte | 0 | 0.00 | 2 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 |
| 2 | 0 | Chicago | 0 | 0.00 | 2 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 |
| 3 | 0 | Miami | 0 | 0.00 | 1 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 |
| 4 | 0 | Philadelphia | 0 | 0.00 | 1 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 |
| 5 | 1 | Basse Terre | 0 | 303.18 | 2 | 303.18 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 |
| 6 | 1 | Bogota | 0 | 75.00 | 1 | 75.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 |
| 7 | 1 | Bridgetown | 1 | 196.00 | 1 | 166.00 | 30.00 | 30.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 |
| 8 | 1 | Buenos Aires | 0 | 533.00 | 2 | 533.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 |
| 9 | 1 | Cancun | 0 | 116.58 | 2 | 116.58 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 |
| 10 | 1 | Charlotte | 0 | 137.66 | 4 | 137.66 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 |
| 11 | 1 | Charlotte | 1 | 281.20 | 3 | 211.67 | 69.53 | 46.19 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 23.35 |
| 12 | 1 | Chicago | 0 | 811.93 | 1 | 811.93 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 |
| 13 | 1 | Chicago | 1 | 558.85 | 4 | 463.13 | 95.72 | 41.22 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 54.50 |
| 14 | 1 | Dallas-Fort Worth | 0 | 371.83 | 13 | 371.83 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 |
| 15 | 1 | Dallas-Fort Worth | 1 | 434.81 | 3 | 382.18 | 52.63 | 10.21 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 42.42 |
| 16 | 1 | Guadalajara | 1 | 119.00 | 1 | 89.00 | 30.00 | 30.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 |
| 17 | 1 | London | 0 | 1278.07 | 8 | 1278.07 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 |
| 18 | 1 | London | 1 | 363.84 | 2 | 315.00 | 48.83 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 48.83 |
| 19 | 1 | Los Angeles | 0 | 558.05 | 2 | 558.05 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 |
| 20 | 1 | Madrid | 0 | 914.00 | 2 | 914.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 |
| 21 | 1 | Mexico City | 0 | 283.00 | 1 | 283.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 |
| 22 | 1 | Miami | 0 | 268.83 | 18 | 268.83 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 |
| 23 | 1 | Miami | 1 | 262.86 | 5 | 209.45 | 53.41 | 34.04 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 19.37 |
| 24 | 1 | Montego Bay | 0 | 72.30 | 2 | 72.30 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 |
| 25 | 1 | Monterrey | 0 | 326.00 | 1 | 326.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 |
| 26 | 1 | Monterrey | 1 | 341.94 | 1 | 307.00 | 34.94 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 34.94 |
| 27 | 1 | New York | 0 | 228.71 | 4 | 228.71 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 |
| 28 | 1 | Paris | 0 | 878.20 | 1 | 878.20 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 |
| 29 | 1 | Philadelphia | 0 | 655.02 | 7 | 655.02 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 |
| 30 | 1 | Phoenix | 0 | 196.71 | 1 | 196.71 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 |
| 31 | 1 | Puerto Vallarta | 0 | 113.02 | 1 | 113.02 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 |
| 32 | 1 | Punta Cana | 1 | 177.79 | 1 | 120.00 | 57.79 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 57.79 |
| 33 | 1 | San Juan | 0 | 193.67 | 1 | 193.67 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 |
| 34 | 1 | San Pedro Sula | 0 | 61.50 | 1 | 61.50 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 |
| 35 | 1 | Santiago | 0 | 426.95 | 1 | 426.95 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 |
| 36 | 1 | Santo Domingo | 0 | 144.63 | 2 | 144.63 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 |
| 37 | 1 | Tokyo | 0 | 1036.05 | 1 | 1036.05 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 |
| 38 | 1 | Toronto | 0 | 247.25 | 2 | 247.25 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 |
| 39 | 1 | Willemstad | 0 | 90.58 | 1 | 90.58 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 |
#Revenue breakdown on most frequent routes
mk_dir_multiple = US_df.groupby(['Spend_Ind','MKT_N_DIRECTN_AIRPRT_PAIR_CD','ancillary_spend_ind']).agg({'Total_Revenue': ['mean','count'],'FLIGHT_REV': ['mean'],'ancll_rev_ttl': ['mean'],
'rev_bag_non_cat': ['mean'],
'rev_other_non_cat':['mean'],
'rev_upgrade': ['mean'],
'rev_tsa': ['mean'],
'rev_mileage_multiplier': ['mean'],
'rev_bag': ['mean'],
'rev_pref_seats': ['mean'],})
mk_dir_multiple.columns = ['ttl mean','ttl count','flight mean','ancllry mean','rev_bag_non_cat', 'rev_other_non_cat','rev_upgrade','rev_tsa','rev_mileage_multiplier','rev_bag','rev_pref_seats']
mk_dir_multiple = mk_dir_multiple.reset_index()
mk_dir_multiple
| Spend_Ind | MKT_N_DIRECTN_AIRPRT_PAIR_CD | ancillary_spend_ind | ttl mean | ttl count | flight mean | ancllry mean | rev_bag_non_cat | rev_other_non_cat | rev_upgrade | rev_tsa | rev_mileage_multiplier | rev_bag | rev_pref_seats | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | ASEORD | 0 | 0.00 | 1 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 |
| 1 | 0 | ATLDFW | 0 | 0.00 | 1 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 |
| 2 | 0 | ATLPHL | 0 | 0.00 | 1 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 |
| 3 | 0 | AUSDFW | 0 | 0.00 | 1 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 |
| 4 | 0 | BDLCLT | 0 | 0.00 | 1 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 535 | 1 | PHXSNA | 0 | 115.35 | 1 | 115.35 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 |
| 536 | 1 | PHXSNA | 1 | 144.32 | 1 | 97.56 | 46.76 | 30.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 16.76 |
| 537 | 1 | PHXTPA | 0 | 224.38 | 1 | 224.38 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 |
| 538 | 1 | PHXTUS | 0 | 103.48 | 4 | 103.48 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 |
| 539 | 1 | PHXYUM | 0 | 62.48 | 2 | 62.48 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 |
540 rows × 14 columns
#Revenue breakdown on age groups
age_cat_multiple = df_subset_spend.groupby(['Spend_Ind','age_cat','ancillary_spend_ind']).agg({'Total_Revenue': ['mean','count'],'FLIGHT_REV': ['mean'],'ancll_rev_ttl': ['mean'],
'rev_bag_non_cat': ['mean'],
'rev_other_non_cat':['mean'],
'rev_upgrade': ['mean'],
'rev_tsa': ['mean'],
'rev_mileage_multiplier': ['mean'],
'rev_bag': ['mean'],
'rev_pref_seats': ['mean'],})
age_cat_multiple.columns = ['ttl mean','ttl count','flight mean','ancllry mean','rev_bag_non_cat','rev_other_non_cat','rev_upgrade','rev_tsa','rev_mileage_multiplier','rev_bag','rev_pref_seats']
age_cat_multiple = age_cat_multiple.reset_index()
age_cat_multiple
| Spend_Ind | age_cat | ancillary_spend_ind | ttl mean | ttl count | flight mean | ancllry mean | rev_bag_non_cat | rev_other_non_cat | rev_upgrade | rev_tsa | rev_mileage_multiplier | rev_bag | rev_pref_seats | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | Z | 0 | 177.38 | 24 | 177.38 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 |
| 1 | 1 | Z | 1 | 126.09 | 7 | 92.70 | 33.39 | 15.71 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 17.68 |
| 2 | 1 | Millennial | 0 | 182.50 | 187 | 182.50 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 |
| 3 | 1 | Millennial | 1 | 183.66 | 45 | 148.44 | 35.22 | 20.67 | 1.67 | 0.00 | 0.00 | 0.00 | 0.00 | 12.88 |
| 4 | 1 | X | 0 | 233.05 | 254 | 233.05 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 |
| 5 | 1 | X | 1 | 224.42 | 57 | 183.13 | 41.29 | 16.42 | 1.32 | 2.25 | 0.62 | 0.00 | 0.00 | 20.69 |
| 6 | 1 | Boomers | 0 | 199.14 | 248 | 199.14 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 |
| 7 | 1 | Boomers | 1 | 210.34 | 56 | 164.01 | 46.33 | 20.78 | 1.34 | 0.76 | 0.00 | 0.92 | 0.00 | 22.53 |
| 8 | 1 | Silent | 0 | 237.84 | 24 | 237.84 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 |
| 9 | 1 | Silent | 1 | 201.69 | 10 | 143.80 | 57.89 | 26.00 | 7.50 | 0.00 | 0.00 | 0.00 | 0.00 | 24.39 |
#Converting month and weekday to integers
df_subset_ancll_spend["dep_month"]=df_subset_ancll_spend["dep_month"].astype(int)
df_subset_ancll_spend["day_of_week_dep"]=df_subset_ancll_spend["day_of_week_dep"].astype(int)
#Creating month and week column based on numbers
#month column
import calendar
df_subset_ancll_spend["month"] = df_subset_ancll_spend["dep_month"].apply(lambda x: calendar.month_name[x])
#Day column
days = {2:'Mon',3:'Tues',4:'Weds',5:'Thurs',6:'Fri',7:'Sat',1:'Sun'}
df_subset_ancll_spend['day'] = df_subset_ancll_spend['day_of_week_dep'] .apply(lambda x: days[x])
#Creating new dataframes to sort based on the month and week
df_ms = df_subset_ancll_spend.sort_values('dep_month', ascending = True).reset_index(drop=True)
df_ws = df_subset_ancll_spend.sort_values('day_of_week_dep', ascending = True).reset_index(drop=True)
# Total Revenue
# Draw Plot SEG_INTRVL_TYPE
fig, axes = plt.subplots(1, 2, figsize=(30,5), dpi= 80)
sns.lineplot(x='month', y='Total_Revenue', hue=df_ms.SEG_INTRVL_TYPE,data=df_ms, sort=True,palette='coolwarm',ci=None, ax=axes[0]).set(xlabel='Month', ylabel='Total Revenue')
sns.lineplot(x='day', y='Total_Revenue',hue=df_ws.SEG_INTRVL_TYPE, data=df_ws, sort=True, palette='coolwarm',ci=None).set(xlabel='Day of the Week', ylabel='Total Revenue')
axes[0].set_title('Annual', fontsize=18);
axes[1].set_title('Weekly', fontsize=18)
plt.show()
# Flight Revenue
# Draw Plot
fig, axes = plt.subplots(1, 2, figsize=(30,5), dpi= 80)
sns.lineplot(x='month', y='FLIGHT_REV', hue=df_ms.SEG_INTRVL_TYPE,data=df_ms,palette='coolwarm',ci=None, ax=axes[0]).set(xlabel='Month', ylabel='Flight Revenue')
sns.lineplot(x='day', y='FLIGHT_REV',hue=df_ws.SEG_INTRVL_TYPE, data=df_ws,palette='coolwarm',ci=None).set(xlabel='Day of the Week', ylabel='Flight Revenue')
axes[0].set_title('Annual', fontsize=18);
axes[1].set_title('Weekly', fontsize=18)
plt.show()
# ancll_rev_ttl
# Draw Plot
fig, axes = plt.subplots(1, 2, figsize=(30,5), dpi= 80)
sns.lineplot(x='month', y='ancll_rev_ttl', hue=df_ms.SEG_INTRVL_TYPE,data=df_ms,palette='coolwarm',ci=None, ax=axes[0]).set(xlabel='Month', ylabel='Ancillary Revenue')
sns.lineplot(x='day', y='ancll_rev_ttl',hue=df_ws.SEG_INTRVL_TYPE, data=df_ws,palette='coolwarm',ci=None).set(xlabel='Day of the Week', ylabel='Ancillary Revenue')
axes[0].set_title('Annual', fontsize=18);
axes[1].set_title('Weekly', fontsize=18)
plt.show()
# rev_upgrade
# Draw Plot
fig, axes = plt.subplots(1, 2, figsize=(30,5), dpi= 80)
sns.lineplot(x='month', y='rev_upgrade', hue=df_ms.SEG_INTRVL_TYPE,data=df_ms,palette='coolwarm',ci=None, ax=axes[0]).set(xlabel='Month', ylabel='Ancillary Upgrade Revenue')
sns.lineplot(x='day', y='rev_upgrade',hue=df_ws.SEG_INTRVL_TYPE, data=df_ws,palette='coolwarm',ci=None).set(xlabel='Day of the Week', ylabel='Ancillary Upgrade Revenue')
axes[0].set_title('Annual', fontsize=18);
axes[1].set_title('Weekly', fontsize=18)
plt.show()
# rev_tsa
# Draw Plot
fig, axes = plt.subplots(1, 2, figsize=(30,5), dpi= 80)
sns.lineplot(x='month', y='rev_tsa', hue=df_ms.SEG_INTRVL_TYPE,data=df_ms,palette='coolwarm',ci=None, ax=axes[0]).set(xlabel='Month', ylabel='Ancillary TSA Revenue')
sns.lineplot(x='day', y='rev_tsa',hue=df_ws.SEG_INTRVL_TYPE, data=df_ws,palette='coolwarm',ci=None).set(xlabel='Day of the Week', ylabel='Ancillary TSA Revenue')
axes[0].set_title('Annual', fontsize=18);
axes[1].set_title('Weekly', fontsize=18)
plt.show()
# rev_mileage_multiplier
# Draw Plot
fig, axes = plt.subplots(1, 2, figsize=(30,5), dpi= 80)
sns.lineplot(x='month', y='rev_mileage_multiplier', hue=df_ms.SEG_INTRVL_TYPE,data=df_ms,palette='coolwarm',ci=None, ax=axes[0]).set(xlabel='Month', ylabel='Ancillary MM Revenue')
sns.lineplot(x='day', y='rev_mileage_multiplier',hue=df_ws.SEG_INTRVL_TYPE, data=df_ws,palette='coolwarm',ci=None).set(xlabel='Day of the Week', ylabel='Ancillary MM Revenue')
axes[0].set_title('Annual', fontsize=18);
axes[1].set_title('Weekly', fontsize=18)
plt.show()
# rev_bag
# Draw Plot
fig, axes = plt.subplots(1, 2, figsize=(30,5), dpi= 80)
sns.lineplot(x='month', y='rev_bag', hue=df_ms.SEG_INTRVL_TYPE,data=df_ms,palette='coolwarm',ci=None, ax=axes[0]).set(xlabel='Month', ylabel='Ancillary Bag Oth Revenue')
sns.lineplot(x='day', y='rev_bag',hue=df_ws.SEG_INTRVL_TYPE, data=df_ws,palette='coolwarm',ci=None).set(xlabel='Day of the Week', ylabel='Ancillary Bag Oth Revenue')
axes[0].set_title('Annual', fontsize=18);
axes[1].set_title('Weekly', fontsize=18)
plt.show()
# rev_pref_seats
# Draw Plot
fig, axes = plt.subplots(1, 2, figsize=(30,5), dpi= 80)
sns.lineplot(x='month', y='rev_pref_seats', hue=df_ms.SEG_INTRVL_TYPE,data=df_ms,palette='coolwarm',ci=None, ax=axes[0]).set(xlabel='Month', ylabel='Ancillary Seat Revenue')
sns.lineplot(x='day', y='rev_pref_seats',hue=df_ws.SEG_INTRVL_TYPE, data=df_ws,palette='coolwarm',ci=None).set(xlabel='Day of the Week', ylabel='Ancillary Seat Revenue')
axes[0].set_title('Annual', fontsize=18);
axes[1].set_title('Weekly', fontsize=18)
plt.show()
# rev_pref_seats
# Draw Plot
fig, axes = plt.subplots(1, 2, figsize=(30,5), dpi= 80)
sns.lineplot(x='month', y='rev_bag_non_cat', hue=df_ms.SEG_INTRVL_TYPE,data=df_ms,palette='coolwarm',ci=None, ax=axes[0]).set(xlabel='Month', ylabel='Ancillary Bag Revenue')
sns.lineplot(x='day', y='rev_bag_non_cat',hue=df_ws.SEG_INTRVL_TYPE, data=df_ws,palette='coolwarm',ci=None).set(xlabel='Day of the Week', ylabel='Ancillary Bag Revenue')
axes[0].set_title('Annual', fontsize=18);
axes[1].set_title('Weekly', fontsize=18)
plt.show()
#Cabin upgrade on flighs with total revenue
sns.set_theme(style="dark")
sns.set_palette('coolwarm')
sns.barplot(x=df_subset_ancll_spend['SEG_INTRVL_TYPE'], y=df_subset_ancll_spend.Total_Revenue, hue=df_subset_ancll_spend.upgrade_ind, ci=None, palette='coolwarm').set(title='Total Revenue on Domestic and International Flight by Cabin Upgrade', ylabel='Total Revenue')
[Text(0.5, 1.0, 'Total Revenue on Domestic and International Flight by Cabin Upgrade'), Text(0, 0.5, 'Total Revenue')]
#Cabin upgrade on flighs with flight revenue
sns.barplot(x=df_subset_ancll_spend['SEG_INTRVL_TYPE'], y=df_subset_ancll_spend.FLIGHT_REV, hue=df_subset_ancll_spend.upgrade_ind, ci=None, palette='coolwarm').set(title='Total Revenue on Domestic and International Flight by Cabin Upgrade', ylabel='Flight Revenue')
[Text(0.5, 1.0, 'Total Revenue on Domestic and International Flight by Cabin Upgrade'), Text(0, 0.5, 'Flight Revenue')]
##Cabin upgrade on flighs with total ancillary revenue
sns.barplot(x=df_subset_ancll_spend['SEG_INTRVL_TYPE'], y=df_subset_ancll_spend.ancll_rev_ttl, hue=df_subset_ancll_spend.upgrade_ind, ci=None, palette='coolwarm').set(title='Total Revenue on Domestic and International Flight by Cabin Upgrade', ylabel='Ancillary Revenue')
[Text(0.5, 1.0, 'Total Revenue on Domestic and International Flight by Cabin Upgrade'), Text(0, 0.5, 'Ancillary Revenue')]
#Frequency of flight and upgrade information - Destination City
sns.set(rc={'figure.figsize':(10,5)})
ax = sns.countplot(x="city_arr", hue="upgrade_ind" ,data=df_subset_ancll_spend , palette='coolwarm', order=df_subset_ancll_spend.city_arr.value_counts().iloc[:7].index).set(title='Frequency of Flights from US Cities with Upgraded Cabin Information', xlabel='Destination City', ylabel='Frequency')
#Frequency of flight and upgrade information - Departure City
sns.set(rc={'figure.figsize':(10,5)})
ax = sns.countplot(x="city_dep", hue="upgrade_ind" ,data=df_subset_ancll_spend , palette='coolwarm', order=df_subset_ancll_spend.city_dep.value_counts().iloc[:7].index).set(title='Frequency of Flights from US Cities with Upgraded Cabin Information', xlabel='Departure City', ylabel='Frequency')
#Frequency of FLights to US Cities
sns.set(rc={'figure.figsize':(12,5)})
ax = sns.countplot(x='city_arr', data=US_df,palette='coolwarm',order=US_df.city_arr.value_counts().iloc[:7].index).set(title='Frequency of FLights to US Cities', xlabel='Destination City', ylabel='Frequency')
# Departure City upgrade and total revenue
sns.set(rc={'figure.figsize':(12,5)})
ax = sns.countplot(x='city_dep', data=US_df, palette='coolwarm',order=US_df.city_arr.value_counts().iloc[:7].index).set(title='Frequency of FLights from US Cities', xlabel='Departure City', ylabel='Frequency')
#International Total Revenue - Destination City
ax=sns.set(rc={'figure.figsize':(10,8)})
sns.set_theme(style="whitegrid")
ax = sns.barplot(x="city_arr", y='Total_Revenue',hue='ancillary_spend_ind', data=overseas_df ,ci=None, palette='coolwarm', order=overseas_df.city_arr.value_counts().iloc[:7].index).set(title='International Total Revenue - Destination City', xlabel='Destination City', ylabel='Total Revenue')
#International Total Revenue - Departure City
ax=sns.set(rc={'figure.figsize':(10,8)})
sns.set_theme(style="whitegrid")
ax = sns.barplot(x="city_dep", y='Total_Revenue',hue='ancillary_spend_ind', data=overseas_df ,ci=None, palette='coolwarm', order=overseas_df.city_dep.value_counts().iloc[:7].index).set(title='International Total Revenue - Departure City', xlabel='Departure City', ylabel='Total Revenue')
#Highest Total Revenue - Departure City
sns.set_theme(style="whitegrid")
ax = sns.barplot(x="city_dep", y='Total_Revenue', hue='ancillary_spend_ind',data=US_df,ci=None,palette='coolwarm', order=US_df.city_arr.value_counts().iloc[:7].index).set(title='Highest Total Revenue - Departure City', xlabel='Departure City', ylabel='Total Revenue')
#Highest Total Revenue - Destination City
ax=sns.set(rc={'figure.figsize':(10,8)})
sns.set_theme(style="whitegrid")
ax = sns.barplot(x="city_arr", y='Total_Revenue',hue='ancillary_spend_ind', data=US_df ,ci=None, palette='coolwarm', order=US_df.city_arr.value_counts().iloc[:7].index).set(title='Highest Total Revenue - Destination City', xlabel='Destination City', ylabel='Total Revenue')
#Highest Total Revenue with Loyalty Level Information - Destination City
sns.set(rc={'figure.figsize':(12,5)})
ax = sns.barplot(x="city_arr", y='Total_Revenue', hue="LYLTY_LEVEL_CD", palette='coolwarm', data=US_df,ci=None, order=US_df.city_arr.value_counts().iloc[:7].index).set(title='Highest Total Revenue with Loyalty Level Information - Destination City', xlabel='Destination City', ylabel='Total Revenue')
plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
<matplotlib.legend.Legend at 0x7f17dbe71438>
#Highest Total Revenue with Loyalty Level Information - Departure City
sns.set(rc={'figure.figsize':(12,5)})
ax = sns.barplot(x="city_dep", y='Total_Revenue', hue="LYLTY_LEVEL_CD", palette='coolwarm', data=US_df,ci=None,order=US_df.city_arr.value_counts().iloc[:7].index).set(title='Highest Total Revenue with Loyalty Level Information - Departure City', xlabel='Departure City', ylabel='Total Revenue')
plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
<matplotlib.legend.Legend at 0x7f17db01dcc0>
# Highest Total Revenue from US to Domestic and International Flights
sns.set(rc={'figure.figsize':(12,5)})
ax = sns.barplot(x="city_dep", y='Total_Revenue', hue="SEG_INTRVL_TYPE", ci=None,palette='coolwarm', data=df_subset_spend, order=df.city_arr.value_counts().iloc[:7].index).set(title='Highest Total Revenue from US to Domestic and International Flights', xlabel='Departure City', ylabel='Total Revenue')
#Highest Total Revenue from International Flights
sns.set(rc={'figure.figsize':(12,5)})
ax = sns.barplot(x="city_arr", y='Total_Revenue', hue='ancillary_spend_ind', ci=None,palette='coolwarm', data=overseas_df,order=pd.value_counts(overseas_df['city_arr']).iloc[:7].index).set(title='Highest Total Revenue from International Flights', xlabel='Destination City', ylabel='Total Revenue')
#Highest Total Revenue from International Flights
sns.set(rc={'figure.figsize':(12,5)})
ax = sns.barplot(x="city_dep", y='Total_Revenue', hue='ancillary_spend_ind', ci=None,palette='coolwarm', data=overseas_df,order=pd.value_counts(overseas_df['city_dep']).iloc[:7].index).set(title='Highest Total Revenue from International Flights', xlabel='Departure City', ylabel='Total Revenue')
# Create frequency of flight
df_join.groupby(['DEP_CNTRY_CD',
'ARVL_CNTRY_CD',
'city_dep',
'city_arr',
'location_lat_dep',
'location_long_dep',
'location_lat_arr',
'location_long_arr'
]).size().reset_index(name="number_of_flight").to_csv('flight Frequency.csv')
flight_freq= pd.read_csv('flight frequency.csv')
#International Flights
overseas_df = flight_freq[(flight_freq["DEP_CNTRY_CD"] != flight_freq["ARVL_CNTRY_CD"]) & (flight_freq["ARVL_CNTRY_CD"] != "US")]
#print("International Travel Dataset Size After Filtering : ", overseas_df.shape)
#Domestic Flights
US_df = flight_freq[flight_freq["DEP_CNTRY_CD"] == flight_freq["ARVL_CNTRY_CD"]]
#print("International Travel Dataset Size After Filtering : ", US_df.shape)
#Reading World map
with plt.style.context(("seaborn", "ggplot")):
world = gpd.read_file(gpd.datasets.get_path("naturalearth_lowres"))
world.head()
| pop_est | continent | name | iso_a3 | gdp_md_est | geometry | |
|---|---|---|---|---|---|---|
| 0 | 920938 | Oceania | Fiji | FJI | 8374.00 | MULTIPOLYGON (((180.00000 -16.06713, 180.00000... |
| 1 | 53950935 | Africa | Tanzania | TZA | 150600.00 | POLYGON ((33.90371 -0.95000, 34.07262 -1.05982... |
| 2 | 603253 | Africa | W. Sahara | ESH | 906.50 | POLYGON ((-8.66559 27.65643, -8.66512 27.58948... |
| 3 | 35623680 | North America | Canada | CAN | 1674000.00 | MULTIPOLYGON (((-122.84000 49.00000, -122.9742... |
| 4 | 326625791 | North America | United States of America | USA | 18560000.00 | MULTIPOLYGON (((-122.84000 49.00000, -120.0000... |
with plt.style.context(("seaborn", "ggplot")):
## Plot world
world.plot(figsize=(60,10), edgecolor="blue", color="white");
## Loop through each flight plotting line depicting flight between source and destination
## We are also plotting scatter points depicting source and destinations
## Aprt from that we also have added logic for labels to destination cities.
for slat,dlat, slon, dlon, num_flights, src_city, dest_city in zip(overseas_df["location_lat_dep"], overseas_df["location_lat_arr"], overseas_df["location_long_dep"], overseas_df["location_long_arr"], overseas_df["number_of_flight"], overseas_df["city_dep"], overseas_df["city_arr"]):
plt.plot([slon , dlon], [slat, dlat], linewidth=num_flights/20, color="red", alpha=0.5)
plt.scatter( [slon, dlon], [slat, dlat], color="blue", alpha=1, s=num_flights/2)
#plt.text(slon+5, slat+5, src_city, fontsize=9, color="blue",alpha=1.0, horizontalalignment='center', verticalalignment='center')
#plt.text(dlon+5, dlat+5, dest_city, fontsize=9, color="black", alpha=1, horizontalalignment='center', verticalalignment='center')
plt.title("Connection Map Depicting Flights from US to All Other Countries")
#plt.savefig("connection-map-geopandas-3.png", dpi=100)
## Plot world
with plt.style.context(("seaborn", "ggplot")):
world[world.name == "United States of America"].plot(figsize=(15,15), edgecolor="blue", color="white");
## Loop through each flight plotting line depicting flight between source and destination
## We are also plotting scatter points depicting source and destinations.
## Aprt from that we also have added logic for labels of source and destination cities.
for slat,dlat, slon, dlon, num_flights, src_city, dest_city in zip(US_df["location_lat_dep"], US_df["location_lat_arr"], US_df["location_long_dep"], US_df["location_long_arr"], US_df["number_of_flight"], US_df["city_dep"], US_df["city_arr"]):
plt.plot([slon , dlon], [slat, dlat], linewidth=num_flights/300, color="lime", alpha=1)
plt.scatter( [slon, dlon], [slat, dlat], color="orangered", alpha=1, s=num_flights/10)
#plt.text(slon+0.5, slat+0.5, src_city, fontsize=9, color="dodgerblue", alpha=0.1, horizontalalignment='center', verticalalignment='center')
#plt.text(dlon+0.5, dlat+0.5, dest_city, fontsize=9, color="dodgerblue", alpha=0.1, horizontalalignment='center', verticalalignment='center')
plt.title("Connection Map Depicting Flights between Cities in United States")
#plt.savefig("connection-map-geopandas-5.png", dpi=100)